Initial commit: Odoo 18 RAG stack
Scraper, indexer, and FastAPI query service for Retrieval-Augmented Generation over Odoo 18 documentation. Uses Qdrant + Ollama (nomic-embed-text + llama3.1). Integrates with ActiveBlue PeerBus agent interface. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
18
.gitignore
vendored
Normal file
18
.gitignore
vendored
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*.egg-info/
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
.env
|
||||||
|
|
||||||
|
# Data (scraped docs — too large for git, regenerate with scraper)
|
||||||
|
data/raw/
|
||||||
|
data/*.jsonl
|
||||||
|
|
||||||
|
# Docker
|
||||||
|
.docker/
|
||||||
|
|
||||||
|
# OS
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
16
Dockerfile
Normal file
16
Dockerfile
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
FROM python:3.11-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
libxml2 libxslt1.1 curl \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
COPY scraper/ ./scraper/
|
||||||
|
COPY indexer/ ./indexer/
|
||||||
|
COPY api/ ./api/
|
||||||
|
|
||||||
|
CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]
|
||||||
127
README.md
Normal file
127
README.md
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
# odoo18-rag
|
||||||
|
|
||||||
|
Retrieval-Augmented Generation over the full Odoo 18 documentation.
|
||||||
|
Built for the ActiveBlue AI agent stack.
|
||||||
|
|
||||||
|
## Stack
|
||||||
|
|
||||||
|
| Component | What it does |
|
||||||
|
|---|---|
|
||||||
|
| `scraper/` | Crawls odoo.com/documentation/18.0, outputs clean JSONL |
|
||||||
|
| `indexer/` | Chunks pages, embeds with `nomic-embed-text`, loads Qdrant |
|
||||||
|
| `api/` | FastAPI — `/ask`, `/ask/stream`, `/agent/ask`, `/health` |
|
||||||
|
| Qdrant | Vector database (Docker) |
|
||||||
|
| Ollama @ `miaai:11434` | Embeddings + generation (local, HIPAA-safe) |
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Pull the embedding and generation models on miaai
ollama pull nomic-embed-text
ollama pull llama3.1
|
||||||
|
|
||||||
|
# 2. Start Qdrant + RAG API
|
||||||
|
docker compose up -d
|
||||||
|
|
||||||
|
# 3. Scrape the docs (~800 pages, ~20 min)
|
||||||
|
docker compose run --rm scraper
|
||||||
|
|
||||||
|
# 4. Index into Qdrant (~30-40 min)
|
||||||
|
docker compose run --rm indexer
|
||||||
|
|
||||||
|
# 5. Test
|
||||||
|
curl http://localhost:8000/health
|
||||||
|
curl -X POST http://localhost:8000/ask \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"question": "How do I run a payroll batch in Odoo 18?"}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Endpoints
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| GET | `/health` | Qdrant + Ollama connectivity |
|
||||||
|
| GET | `/stats` | Vector count, models in use |
|
||||||
|
| GET | `/modules` | List indexed Odoo modules |
|
||||||
|
| POST | `/ask` | Blocking answer + sources |
|
||||||
|
| POST | `/ask/stream` | SSE token stream |
|
||||||
|
| POST | `/agent/ask` | ActiveBlue PeerBus integration |
|
||||||
|
|
||||||
|
### Ask with module filter
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:8000/ask \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"question": "How do reordering rules work?", "module": "inventory"}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Streaming
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -N -X POST http://localhost:8000/ask/stream \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"question": "Explain the Quote-to-Cash workflow"}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Agent integration
|
||||||
|
|
||||||
|
```python
|
||||||
|
from api.odoo_rag_agent import OdooRagAgent
|
||||||
|
|
||||||
|
agent = OdooRagAgent(rag_url="http://localhost:8000")
|
||||||
|
|
||||||
|
# Generic
|
||||||
|
result = await agent.ask("How do I configure NACHA payments?")
|
||||||
|
|
||||||
|
# Module-scoped
|
||||||
|
result = await agent.ask_payroll("How do I generate a payslip batch?")
|
||||||
|
result = await agent.ask_accounting("What is the chart of accounts?")
|
||||||
|
result = await agent.ask_inventory("How does MTO work?")
|
||||||
|
|
||||||
|
# Streaming
|
||||||
|
async for token in agent.ask_stream("Explain the CRM pipeline"):
|
||||||
|
print(token, end="", flush=True)
|
||||||
|
|
||||||
|
# PeerBus
|
||||||
|
response = await agent.handle_peer_message({
|
||||||
|
"action": "ask",
|
||||||
|
"payload": {"question": "How do I set up taxes?", "module": "accounting"},
|
||||||
|
"request_id": "req-001"
|
||||||
|
})
|
||||||
|
```
|
||||||
|
|
||||||
|
## Re-indexing
|
||||||
|
|
||||||
|
Odoo releases doc updates regularly. Re-index to stay current:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose run --rm scraper
|
||||||
|
docker compose run --rm indexer python /app/indexer/indexer.py --reset
|
||||||
|
```
|
||||||
|
|
||||||
|
Or add a monthly cron on the host:
|
||||||
|
|
||||||
|
```cron
|
||||||
|
0 3 1 * * cd /opt/odoo18-rag && docker compose run --rm scraper && docker compose run --rm indexer python /app/indexer/indexer.py --reset
|
||||||
|
```
|
||||||
|
|
||||||
|
## Scraper options
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Single module only
|
||||||
|
docker compose run --rm scraper python /app/scraper/scraper.py --module accounting
|
||||||
|
|
||||||
|
# Quick test (first 50 pages)
|
||||||
|
docker compose run --rm scraper python /app/scraper/scraper.py --limit 50
|
||||||
|
```
|
||||||
|
|
||||||
|
## Environment variables
|
||||||
|
|
||||||
|
All configurable via `docker-compose.yml` environment section:
|
||||||
|
|
||||||
|
| Variable | Default | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `OLLAMA_URL` | `http://miaai:11434` | Ollama endpoint |
|
||||||
|
| `QDRANT_URL` | `http://qdrant:6333` | Qdrant endpoint |
|
||||||
|
| `EMBED_MODEL` | `nomic-embed-text` | Embedding model |
|
||||||
|
| `GEN_MODEL` | `llama3.1` | Generation model |
|
||||||
|
| `COLLECTION_NAME` | `odoo18_docs` | Qdrant collection |
|
||||||
316
api/main.py
Normal file
316
api/main.py
Normal file
@@ -0,0 +1,316 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Odoo 18 RAG Query API
|
||||||
|
======================
|
||||||
|
FastAPI service — embeds the question, retrieves top-K chunks from Qdrant,
|
||||||
|
builds a prompt, and streams or returns the answer from Ollama.
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
POST /ask blocking answer + sources
|
||||||
|
POST /ask/stream Server-Sent Events token stream
|
||||||
|
POST /agent/ask ActiveBlue AI agent integration
|
||||||
|
GET /health connectivity check
|
||||||
|
GET /modules list indexed modules
|
||||||
|
GET /stats collection stats
|
||||||
|
|
||||||
|
Run:
|
||||||
|
uvicorn api.main:app --host 0.0.0.0 --port 8000 --reload
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from qdrant_client.models import Filter, FieldCondition, MatchValue
|
||||||
|
from typing import AsyncIterator
|
||||||
|
|
||||||
|
# ── Configuration ─────────────────────────────────────────────────────────────

# Honour the LOG_LEVEL environment variable (docker-compose.yml sets it).
# Unknown level names fall back to INFO so a typo cannot crash the service
# at import time.
_log_level = getattr(logging, os.getenv("LOG_LEVEL", "INFO").upper(), None)
logging.basicConfig(level=_log_level if isinstance(_log_level, int) else logging.INFO)
log = logging.getLogger("odoo18_rag")

# Service endpoints and model names; every value can be overridden via the
# environment.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://miaai:11434")
QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant:6333")
EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text")
GEN_MODEL = os.getenv("GEN_MODEL", "llama3.1")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "odoo18_docs")

# Default number of chunks retrieved per question (AskRequest.top_k default).
TOP_K = 6
# Character budget for the documentation context placed in the prompt.
MAX_CONTEXT = 4000

# Instruction text placed at the very top of every generation prompt.
SYSTEM_PROMPT = """\
You are an expert Odoo 18 consultant for ActiveBlue LLC, an MSP serving \
medical and dental practices. You have deep knowledge of all Odoo 18 modules: \
Finance, Accounting, Inventory, Manufacturing, Purchase, Sales, CRM, HR, \
Payroll, eCommerce, Helpdesk, Project, and more.

Answer questions clearly and concisely using the provided documentation context. \
Use numbered steps when explaining procedures. Always mention the Odoo menu path \
when explaining navigation. If the context doesn't cover the question fully, say \
so and answer from general knowledge.\
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ── Models ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class AskRequest(BaseModel):
    """Request body accepted by `/ask`, `/ask/stream` and `/agent/ask`."""

    # The user's question; validated to be 5-2000 characters long.
    question: str = Field(..., min_length=5, max_length=2000)
    # When set, retrieval is restricted to chunks whose "module" payload matches.
    module: str | None = Field(None, description="Filter to one Odoo module")
    # When set, replaces GEN_MODEL for this request only.
    model: str | None = Field(None, description="Override the LLM model")
    # Number of chunks to retrieve (1-20); defaults to TOP_K.
    top_k: int = Field(TOP_K, ge=1, le=20)
    # Sampling temperature forwarded to Ollama (0.0-1.0).
    temperature: float = Field(0.3, ge=0.0, le=1.0)
|
||||||
|
|
||||||
|
|
||||||
|
class Source(BaseModel):
    """One documentation page cited in an answer (built by `dedupe_sources`)."""

    url: str      # page URL; also the de-duplication key
    title: str    # "title" field of the chunk payload
    module: str   # "module" field of the chunk payload
    section: str  # "section" field of the chunk payload
|
||||||
|
|
||||||
|
|
||||||
|
class AskResponse(BaseModel):
    """Response body of the blocking `/ask` endpoint."""

    answer: str            # generated answer text
    sources: list[Source]  # de-duplicated pages the context was drawn from
    model: str             # generation model actually used
    question: str          # the question, echoed back
|
||||||
|
|
||||||
|
|
||||||
|
# ── App ───────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# FastAPI application object; the uvicorn entry point is `api.main:app`.
app = FastAPI(
    title="Odoo 18 RAG API",
    description="Retrieval-Augmented Generation over Odoo 18 documentation",
    version="1.0.0",
)

# CORS is fully open: any origin, method and header is accepted.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])

# Single module-level Qdrant client used by every endpoint below.
qdrant = QdrantClient(url=QDRANT_URL)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def embed_query(text: str) -> list:
    """Embed *text* with EMBED_MODEL via Ollama's `/api/embed` endpoint.

    Returns the first vector of the "embeddings" array in the response.

    Raises:
        httpx.HTTPStatusError: Ollama answered with an error status.
        HTTPException: (500) the response contained no embeddings.
    """
    request_body = {"model": EMBED_MODEL, "input": [text]}
    async with httpx.AsyncClient(timeout=30) as client:
        resp = await client.post(f"{OLLAMA_URL}/api/embed", json=request_body)
        resp.raise_for_status()
        vectors = resp.json().get("embeddings", [])
        if vectors:
            return vectors[0]
        raise HTTPException(500, "Empty embedding response from Ollama")
|
||||||
|
|
||||||
|
|
||||||
|
def retrieve(vector: list, top_k: int, module: str | None) -> list:
    """Return the payloads of the *top_k* points nearest to *vector*.

    A truthy *module* restricts the search to points whose "module" payload
    field equals it; otherwise the whole collection is searched.
    """
    module_filter = (
        Filter(must=[FieldCondition(key="module", match=MatchValue(value=module))])
        if module
        else None
    )
    hits = qdrant.search(
        collection_name=COLLECTION_NAME,
        query_vector=vector,
        limit=top_k,
        query_filter=module_filter,
        with_payload=True,
    )
    return [hit.payload for hit in hits]
|
||||||
|
|
||||||
|
|
||||||
|
def build_prompt(question: str, chunks: list) -> str:
|
||||||
|
context_parts = []
|
||||||
|
char_count = 0
|
||||||
|
for i, chunk in enumerate(chunks, 1):
|
||||||
|
block = (
|
||||||
|
f"[Source {i}: {chunk.get('title', '')} | {chunk.get('section', '')}]\n"
|
||||||
|
f"{chunk.get('text', '')}\n"
|
||||||
|
f"URL: {chunk.get('url', '')}\n"
|
||||||
|
)
|
||||||
|
if char_count + len(block) > MAX_CONTEXT:
|
||||||
|
break
|
||||||
|
context_parts.append(block)
|
||||||
|
char_count += len(block)
|
||||||
|
|
||||||
|
return (
|
||||||
|
f"{SYSTEM_PROMPT}\n\n"
|
||||||
|
f"## Relevant documentation\n\n"
|
||||||
|
f"{'---'.join(context_parts)}\n\n"
|
||||||
|
f"---\n\n"
|
||||||
|
f"## Question\n\n{question}\n\n"
|
||||||
|
f"## Answer\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def dedupe_sources(chunks: list) -> list[Source]:
    """Collapse chunk payloads into one `Source` per distinct URL.

    The first chunk seen for a URL supplies the title, module and section;
    first-seen order is preserved.
    """
    by_url = {}
    for payload in chunks:
        link = payload.get("url", "")
        if link in by_url:
            continue
        by_url[link] = Source(
            url=link,
            title=payload.get("title", ""),
            module=payload.get("module", ""),
            section=payload.get("section", ""),
        )
    return list(by_url.values())
|
||||||
|
|
||||||
|
|
||||||
|
async def generate_blocking(prompt: str, model: str, temperature: float) -> str:
    """Run one non-streaming generation on Ollama and return the stripped text.

    Raises:
        httpx.HTTPStatusError: Ollama answered with an error status.
    """
    request_body = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": temperature, "num_ctx": 8192},
    }
    async with httpx.AsyncClient(timeout=120) as client:
        resp = await client.post(f"{OLLAMA_URL}/api/generate", json=request_body)
        resp.raise_for_status()
        completion = resp.json().get("response", "")
        return completion.strip()
|
||||||
|
|
||||||
|
|
||||||
|
async def generate_stream(prompt: str, model: str, temperature: float) -> AsyncIterator[str]:
    """Stream generated tokens from Ollama's `/api/generate` endpoint.

    The response is read line by line; each non-blank line is parsed as a JSON
    object and its non-empty "response" value is yielded. Lines that are not
    valid JSON are skipped. Iteration stops at the object flagged "done".

    Raises:
        httpx.HTTPStatusError: Ollama answered with an error status. Without
            this check an error body would be parsed as a token-less line and
            the caller would receive an empty answer.
        RuntimeError: A streamed object carried an "error" field.
    """
    request_body = {
        "model": model,
        "prompt": prompt,
        "stream": True,
        "options": {"temperature": temperature, "num_ctx": 8192},
    }
    async with httpx.AsyncClient(timeout=120) as client:
        async with client.stream("POST", f"{OLLAMA_URL}/api/generate", json=request_body) as resp:
            resp.raise_for_status()
            async for line in resp.aiter_lines():
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if data.get("error"):
                    raise RuntimeError(f"Ollama error: {data['error']}")
                token = data.get("response", "")
                if token:
                    yield token
                if data.get("done"):
                    break
|
||||||
|
|
||||||
|
|
||||||
|
# ── Endpoints ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@app.get("/health")
async def health():
    """Report connectivity to Qdrant and Ollama.

    Always returns a dict with "api", "qdrant" and "ollama" keys; a failing
    dependency is reported as an "error: ..." string instead of raising.
    """
    report = {"api": "ok", "qdrant": "unknown", "ollama": "unknown"}

    try:
        collection = qdrant.get_collection(COLLECTION_NAME)
        report["qdrant"] = f"ok ({collection.points_count} vectors)"
    except Exception as e:
        report["qdrant"] = f"error: {e}"

    try:
        async with httpx.AsyncClient(timeout=5) as client:
            resp = await client.get(f"{OLLAMA_URL}/api/tags")
        listed = resp.json().get("models", [])
        names = [entry["name"] for entry in listed]
        report["ollama"] = f"ok ({len(names)} models)"
    except Exception as e:
        report["ollama"] = f"error: {e}"

    return report
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/modules")
async def list_modules():
    """List the distinct "module" payload values stored in the collection.

    The collection is scrolled page by page until Qdrant returns no further
    offset. Reading only the first page would miss modules whose points all
    lie beyond the first 1000 records. Points without a "module" field count
    as "general".

    Raises:
        HTTPException: (500) any failure while talking to Qdrant.
    """
    try:
        modules = set()
        offset = None
        while True:
            points, offset = qdrant.scroll(
                collection_name=COLLECTION_NAME,
                limit=1000,
                offset=offset,
                with_payload=["module"],
            )
            modules.update(
                (point.payload or {}).get("module", "general") for point in points
            )
            if offset is None:
                break
        return {"modules": sorted(modules)}
    except Exception as e:
        raise HTTPException(500, str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/stats")
async def stats():
    """Return the collection name, point count and the models in use.

    Raises:
        HTTPException: (500) the collection info could not be fetched.
    """
    try:
        collection = qdrant.get_collection(COLLECTION_NAME)
        summary = {
            "collection": COLLECTION_NAME,
            "vectors": collection.points_count,
            # Hard-coded, not read back from the collection configuration.
            "vector_size": 768,
            "embed_model": EMBED_MODEL,
            "gen_model": GEN_MODEL,
        }
        return summary
    except Exception as e:
        raise HTTPException(500, str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/ask", response_model=AskResponse)
async def ask(req: AskRequest):
    """Answer a question in one blocking call: embed, retrieve, prompt, generate.

    Raises:
        HTTPException: 500 when embedding, retrieval or generation fails;
            404 when retrieval returns no chunks.
    """
    model = req.model or GEN_MODEL

    try:
        vector = await embed_query(req.question)
    except HTTPException:
        # embed_query already produced a proper HTTP error; do not re-wrap it.
        raise
    except Exception as e:
        raise HTTPException(500, f"Embedding failed: {e}") from e

    try:
        chunks = retrieve(vector, req.top_k, req.module)
    except Exception as e:
        # A Qdrant failure gets the same structured error as the other stages.
        raise HTTPException(500, f"Retrieval failed: {e}") from e
    if not chunks:
        raise HTTPException(404, "No relevant documentation found.")

    prompt = build_prompt(req.question, chunks)

    try:
        answer = await generate_blocking(prompt, model, req.temperature)
    except Exception as e:
        raise HTTPException(500, f"Generation failed: {e}") from e

    return AskResponse(
        answer=answer,
        sources=dedupe_sources(chunks),
        model=model,
        question=req.question,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/ask/stream")
async def ask_stream(req: AskRequest):
    """Answer a question as a Server-Sent Events stream.

    Embedding and retrieval run before the stream starts, so their failures
    are still reported as ordinary HTTP errors. The stream then carries:

    * one ``{"type": "token", "content": ...}`` event per generated token,
    * a final ``{"type": "sources", "sources": [...]}`` event on success, or
      an ``{"type": "error", "error": ...}`` event if generation fails,
    * the literal ``[DONE]`` terminator.

    Raises:
        HTTPException: 500 when embedding or retrieval fails; 404 when
            retrieval returns no chunks.
    """
    model = req.model or GEN_MODEL

    try:
        vector = await embed_query(req.question)
    except HTTPException:
        # embed_query already produced a proper HTTP error; do not re-wrap it.
        raise
    except Exception as e:
        raise HTTPException(500, f"Embedding failed: {e}") from e

    try:
        chunks = retrieve(vector, req.top_k, req.module)
    except Exception as e:
        raise HTTPException(500, f"Retrieval failed: {e}") from e
    if not chunks:
        raise HTTPException(404, "No relevant documentation found.")

    prompt = build_prompt(req.question, chunks)
    sources = [s.model_dump() for s in dedupe_sources(chunks)]

    async def sse():
        try:
            async for token in generate_stream(prompt, model, req.temperature):
                yield f"data: {json.dumps({'type': 'token', 'content': token})}\n\n"
        except Exception as e:
            # The HTTP status line is already sent, so the failure has to be
            # reported in-band; the stream is still terminated cleanly.
            log.exception("Streaming generation failed")
            yield f"data: {json.dumps({'type': 'error', 'error': str(e)})}\n\n"
        else:
            yield f"data: {json.dumps({'type': 'sources', 'sources': sources})}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(sse(), media_type="text/event-stream")
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/agent/ask")
async def agent_ask(req: AskRequest):
    """ActiveBlue AI agent endpoint — compatible with PeerBus message format.

    Delegates to `ask` and flattens the result: sources are reduced to their
    URLs and the requested module is echoed back as "module_context".
    """
    reply = await ask(req)
    source_urls = [src.url for src in reply.sources]
    return {
        "answer": reply.answer,
        "sources": source_urls,
        "module_context": req.module,
        "model_used": reply.model,
    }
|
||||||
147
api/odoo_rag_agent.py
Normal file
147
api/odoo_rag_agent.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
"""
|
||||||
|
ActiveBlue AI Agent — Odoo 18 RAG Specialist
|
||||||
|
=============================================
|
||||||
|
Drop-in specialist agent for the ActiveBlue AI system.
|
||||||
|
Implements the PeerBus interface defined in ACTIVEBLUE_AI_SPEC.md.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from api.odoo_rag_agent import OdooRagAgent
|
||||||
|
|
||||||
|
agent = OdooRagAgent(rag_url="http://localhost:8000")
|
||||||
|
result = await agent.ask("How do I run a payroll batch?")
|
||||||
|
print(result["answer"])
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import httpx
|
||||||
|
import logging
|
||||||
|
from typing import AsyncIterator
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class OdooRagAgent:
|
||||||
|
name = "odoo18_rag"
|
||||||
|
description = "Answers Odoo 18 questions using RAG over official documentation"
|
||||||
|
capabilities = [
|
||||||
|
"odoo_how_to",
|
||||||
|
"odoo_configuration",
|
||||||
|
"odoo_troubleshooting",
|
||||||
|
"odoo_workflow",
|
||||||
|
]
|
||||||
|
privacy_mode = "local" # uses local Ollama — HIPAA safe
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
rag_url: str = "http://localhost:8000",
|
||||||
|
timeout: int = 120,
|
||||||
|
default_model: str | None = None,
|
||||||
|
):
|
||||||
|
self.rag_url = rag_url.rstrip("/")
|
||||||
|
self.timeout = timeout
|
||||||
|
self.default_model = default_model
|
||||||
|
|
||||||
|
async def ask(
|
||||||
|
self,
|
||||||
|
question: str,
|
||||||
|
module: str | None = None,
|
||||||
|
top_k: int = 6,
|
||||||
|
temperature: float = 0.3,
|
||||||
|
) -> dict:
|
||||||
|
payload = {"question": question, "top_k": top_k, "temperature": temperature}
|
||||||
|
if module:
|
||||||
|
payload["module"] = module
|
||||||
|
if self.default_model:
|
||||||
|
payload["model"] = self.default_model
|
||||||
|
|
||||||
|
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
||||||
|
resp = await client.post(f"{self.rag_url}/ask", json=payload)
|
||||||
|
resp.raise_for_status()
|
||||||
|
return resp.json()
|
||||||
|
|
||||||
|
async def ask_stream(
|
||||||
|
self,
|
||||||
|
question: str,
|
||||||
|
module: str | None = None,
|
||||||
|
top_k: int = 6,
|
||||||
|
temperature: float = 0.3,
|
||||||
|
) -> AsyncIterator[str]:
|
||||||
|
payload = {"question": question, "top_k": top_k, "temperature": temperature}
|
||||||
|
if module:
|
||||||
|
payload["module"] = module
|
||||||
|
|
||||||
|
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
||||||
|
async with client.stream("POST", f"{self.rag_url}/ask/stream", json=payload) as resp:
|
||||||
|
async for line in resp.aiter_lines():
|
||||||
|
if line.startswith("data: "):
|
||||||
|
data_str = line[6:]
|
||||||
|
if data_str == "[DONE]":
|
||||||
|
break
|
||||||
|
try:
|
||||||
|
data = json.loads(data_str)
|
||||||
|
if data.get("type") == "token":
|
||||||
|
yield data["content"]
|
||||||
|
elif data.get("type") == "sources":
|
||||||
|
yield json.dumps(data)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
async def handle_peer_message(self, message: dict) -> dict:
|
||||||
|
"""PeerBus message handler for the ActiveBlue Master AI."""
|
||||||
|
action = message.get("action")
|
||||||
|
payload = message.get("payload", {})
|
||||||
|
req_id = message.get("request_id")
|
||||||
|
|
||||||
|
if action == "ask":
|
||||||
|
result = await self.ask(
|
||||||
|
question = payload.get("question", ""),
|
||||||
|
module = payload.get("module"),
|
||||||
|
top_k = payload.get("top_k", 6),
|
||||||
|
temperature = payload.get("temperature", 0.3),
|
||||||
|
)
|
||||||
|
return {"request_id": req_id, "agent": self.name, "status": "ok", "result": result}
|
||||||
|
|
||||||
|
elif action == "capabilities":
|
||||||
|
return {
|
||||||
|
"request_id": req_id,
|
||||||
|
"agent": self.name,
|
||||||
|
"capabilities": self.capabilities,
|
||||||
|
"description": self.description,
|
||||||
|
"privacy_mode": self.privacy_mode,
|
||||||
|
}
|
||||||
|
|
||||||
|
elif action == "health":
|
||||||
|
return await self.health()
|
||||||
|
|
||||||
|
return {"request_id": req_id, "agent": self.name, "status": "error", "error": f"Unknown action: {action}"}
|
||||||
|
|
||||||
|
async def health(self) -> dict:
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(timeout=5) as client:
|
||||||
|
resp = await client.get(f"{self.rag_url}/health")
|
||||||
|
return {"agent": self.name, "status": "ok", "rag": resp.json()}
|
||||||
|
except Exception as e:
|
||||||
|
return {"agent": self.name, "status": "error", "error": str(e)}
|
||||||
|
|
||||||
|
# ── Module convenience wrappers ───────────────────────────────────────────
|
||||||
|
|
||||||
|
async def ask_accounting(self, question: str) -> dict:
|
||||||
|
return await self.ask(question, module="accounting")
|
||||||
|
|
||||||
|
async def ask_payroll(self, question: str) -> dict:
|
||||||
|
return await self.ask(question, module="payroll")
|
||||||
|
|
||||||
|
async def ask_inventory(self, question: str) -> dict:
|
||||||
|
return await self.ask(question, module="inventory")
|
||||||
|
|
||||||
|
async def ask_crm(self, question: str) -> dict:
|
||||||
|
return await self.ask(question, module="crm")
|
||||||
|
|
||||||
|
async def ask_hr(self, question: str) -> dict:
|
||||||
|
return await self.ask(question, module="employees")
|
||||||
|
|
||||||
|
async def ask_manufacturing(self, question: str) -> dict:
|
||||||
|
return await self.ask(question, module="manufacturing")
|
||||||
|
|
||||||
|
async def ask_helpdesk(self, question: str) -> dict:
|
||||||
|
return await self.ask(question, module="helpdesk")
|
||||||
0
data/.gitkeep
Normal file
0
data/.gitkeep
Normal file
102
docker-compose.yml
Normal file
102
docker-compose.yml
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
version: "3.9"
|
||||||
|
|
||||||
|
# ─── Odoo 18 RAG Stack ────────────────────────────────────────────────────────
|
||||||
|
# rag-api:8000 ──► qdrant:6333 (internal docker network)
|
||||||
|
# rag-api ──► miaai:11434 (direct outbound to Ollama)
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# docker compose up -d # start Qdrant + RAG API
|
||||||
|
# docker compose logs -f rag-api # follow API logs
|
||||||
|
# docker compose run --rm scraper # scrape Odoo 18 docs
|
||||||
|
# docker compose run --rm indexer # embed + load into Qdrant
|
||||||
|
# docker compose run --rm indexer python /app/indexer/indexer.py --reset
|
||||||
|
|
||||||
|
services:
|
||||||
|
|
||||||
|
qdrant:
|
||||||
|
image: qdrant/qdrant:v1.9.0
|
||||||
|
container_name: odoo18-qdrant
|
||||||
|
restart: unless-stopped
|
||||||
|
volumes:
|
||||||
|
- qdrant_storage:/qdrant/storage
|
||||||
|
ports:
|
||||||
|
- "6333:6333"
|
||||||
|
- "6334:6334"
|
||||||
|
environment:
|
||||||
|
QDRANT__SERVICE__HTTP_PORT: 6333
|
||||||
|
QDRANT__SERVICE__GRPC_PORT: 6334
|
||||||
|
QDRANT__LOG_LEVEL: INFO
|
||||||
|
networks:
|
||||||
|
- rag_net
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:6333/healthz"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
rag-api:
|
||||||
|
build: .
|
||||||
|
container_name: odoo18-rag-api
|
||||||
|
restart: unless-stopped
|
||||||
|
depends_on:
|
||||||
|
qdrant:
|
||||||
|
condition: service_healthy
|
||||||
|
ports:
|
||||||
|
- "8000:8000"
|
||||||
|
environment:
|
||||||
|
OLLAMA_URL: "http://miaai:11434"
|
||||||
|
QDRANT_URL: "http://qdrant:6333"
|
||||||
|
COLLECTION_NAME: "odoo18_docs"
|
||||||
|
EMBED_MODEL: "nomic-embed-text"
|
||||||
|
GEN_MODEL: "llama3.1"
|
||||||
|
LOG_LEVEL: "INFO"
|
||||||
|
volumes:
|
||||||
|
- ./data:/app/data
|
||||||
|
extra_hosts:
|
||||||
|
- "miaai:192.168.2.9"
|
||||||
|
networks:
|
||||||
|
- rag_net
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
scraper:
|
||||||
|
build: .
|
||||||
|
container_name: odoo18-scraper
|
||||||
|
profiles: ["scraper"]
|
||||||
|
command: python /app/scraper/scraper.py
|
||||||
|
volumes:
|
||||||
|
- ./data:/app/data
|
||||||
|
networks:
|
||||||
|
- rag_net
|
||||||
|
environment:
|
||||||
|
PYTHONUNBUFFERED: "1"
|
||||||
|
|
||||||
|
indexer:
|
||||||
|
build: .
|
||||||
|
container_name: odoo18-indexer
|
||||||
|
profiles: ["indexer"]
|
||||||
|
command: python /app/indexer/indexer.py
|
||||||
|
depends_on:
|
||||||
|
qdrant:
|
||||||
|
condition: service_healthy
|
||||||
|
volumes:
|
||||||
|
- ./data:/app/data
|
||||||
|
extra_hosts:
|
||||||
|
- "miaai:192.168.2.9"
|
||||||
|
networks:
|
||||||
|
- rag_net
|
||||||
|
environment:
|
||||||
|
OLLAMA_URL: "http://miaai:11434"
|
||||||
|
QDRANT_URL: "http://qdrant:6333"
|
||||||
|
PYTHONUNBUFFERED: "1"
|
||||||
|
|
||||||
|
networks:
|
||||||
|
rag_net:
|
||||||
|
driver: bridge
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
qdrant_storage:
|
||||||
|
name: odoo18_qdrant_storage
|
||||||
244
indexer/indexer.py
Normal file
244
indexer/indexer.py
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Odoo 18 RAG Indexer
|
||||||
|
====================
|
||||||
|
Reads scraped pages, chunks them, embeds with nomic-embed-text via Ollama,
|
||||||
|
and upserts into Qdrant.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python indexer.py # index everything
|
||||||
|
python indexer.py --reset # drop collection and re-index
|
||||||
|
python indexer.py --module accounting
|
||||||
|
|
||||||
|
Requires:
|
||||||
|
- Qdrant running: docker compose up -d qdrant
|
||||||
|
- Ollama with model pulled: ollama pull nomic-embed-text
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import argparse
|
||||||
|
import hashlib
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from qdrant_client.models import (
|
||||||
|
Distance, VectorParams, PointStruct,
|
||||||
|
Filter, FieldCondition, MatchValue,
|
||||||
|
)
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Service endpoints and names; overridable via the environment.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://miaai:11434")
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "odoo18_docs")

# Dimension the Qdrant collection is created with.
VECTOR_SIZE = 768

# Anchored to this file instead of the working directory. With the
# Dockerfile's `WORKDIR /app`, a cwd-relative "../data/raw" resolves to
# /data/raw, outside the ./data:/app/data volume. Resolving from
# <repo>/indexer/indexer.py yields <repo>/data/raw both in the container and
# when run locally from any directory.
RAW_DATA_FILE = Path(__file__).resolve().parent.parent / "data" / "raw" / "odoo18_docs_raw.jsonl"

BATCH_SIZE = 32     # batch size constant (presumably texts per embed call — its use is not shown here)
CHUNK_SIZE = 512    # chunk size; split_text scales it by 0.75 to get a word target
CHUNK_OVERLAP = 64  # overlap between consecutive chunks, scaled the same way
UPSERT_BATCH = 100  # batch size constant (presumably points per upsert — its use is not shown here)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Chunk:
    """One indexable slice of a scraped page, as produced by `chunk_page`."""

    chunk_id: str     # first 20 hex chars of sha256("<doc_id>_<chunk_index>")
    doc_id: str       # identifier of the source page
    url: str          # URL of the source page
    title: str        # title of the source page
    module: str       # module name; "general" when the page has none
    section: str      # section name; empty string when the page has none
    headings: list    # headings recorded for the page (may be empty)
    text: str         # the chunk's text
    chunk_index: int  # zero-based position of this chunk within its page
|
||||||
|
|
||||||
|
|
||||||
|
def split_text(text: str) -> list:
|
||||||
|
target_words = int(CHUNK_SIZE * 0.75)
|
||||||
|
overlap_words = int(CHUNK_OVERLAP * 0.75)
|
||||||
|
|
||||||
|
sentences = []
|
||||||
|
current = []
|
||||||
|
for word in text.split():
|
||||||
|
current.append(word)
|
||||||
|
if word.endswith((".", "?", "!", ":\n", "\n\n")):
|
||||||
|
sentences.append(" ".join(current))
|
||||||
|
current = []
|
||||||
|
if current:
|
||||||
|
sentences.append(" ".join(current))
|
||||||
|
|
||||||
|
chunks = []
|
||||||
|
buffer_words = []
|
||||||
|
buffer_count = 0
|
||||||
|
|
||||||
|
for sentence in sentences:
|
||||||
|
s_words = sentence.split()
|
||||||
|
s_count = len(s_words)
|
||||||
|
if buffer_count + s_count > target_words and buffer_words:
|
||||||
|
chunks.append(" ".join(buffer_words))
|
||||||
|
overlap_slice = buffer_words[-overlap_words:] if overlap_words else []
|
||||||
|
buffer_words = overlap_slice + s_words
|
||||||
|
buffer_count = len(buffer_words)
|
||||||
|
else:
|
||||||
|
buffer_words.extend(s_words)
|
||||||
|
buffer_count += s_count
|
||||||
|
|
||||||
|
if buffer_words:
|
||||||
|
chunks.append(" ".join(buffer_words))
|
||||||
|
|
||||||
|
return [c for c in chunks if len(c.strip()) > 80]
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_page(page: dict) -> list:
    """Turn one scraped page into a list of `Chunk` objects.

    The page's "text" is split with `split_text`; every piece becomes a Chunk
    carrying the page metadata. The chunk id is the first 20 hex characters
    of the SHA-256 of "<doc_id>_<index>". Returns an empty list when the
    text yields no chunks.
    """
    pieces = split_text(page["text"])
    return [
        Chunk(
            chunk_id=hashlib.sha256(f"{page['doc_id']}_{position}".encode()).hexdigest()[:20],
            doc_id=page["doc_id"],
            url=page["url"],
            title=page["title"],
            module=page.get("module", "general"),
            section=page.get("section", ""),
            headings=page.get("headings", []),
            text=piece,
            chunk_index=position,
        )
        for position, piece in enumerate(pieces)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def embed_batch(texts: list) -> list:
    """Embed a batch of texts with EMBED_MODEL via Ollama's `/api/embed`.

    Returns the "embeddings" array of the response.

    Raises:
        requests.HTTPError: Ollama answered with an error status.
        ValueError: The number of embeddings differs from the number of texts.
    """
    request_body = {"model": EMBED_MODEL, "input": texts}
    resp = requests.post(f"{OLLAMA_URL}/api/embed", json=request_body, timeout=120)
    resp.raise_for_status()

    vectors = resp.json().get("embeddings", [])
    if len(vectors) == len(texts):
        return vectors
    raise ValueError(f"Expected {len(texts)} embeddings, got {len(vectors)}")
|
||||||
|
|
||||||
|
|
||||||
|
def check_ollama() -> bool:
    """Check that Ollama answers and lists the embedding model.

    Returns:
        ``True`` when ``/api/tags`` responds successfully and at least one
        listed model name contains ``EMBED_MODEL``; ``False`` otherwise.
        Every failure is logged instead of raised.
    """
    try:
        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
        # Treat an HTTP error as "unreachable" instead of misreporting it as a
        # missing model when the error body happens to be JSON.
        resp.raise_for_status()
        models = [m["name"] for m in resp.json().get("models", [])]
        # Substring match, so a tagged name such as "<model>:latest" also counts.
        if not any(EMBED_MODEL in m for m in models):
            log.error(f"Model '{EMBED_MODEL}' not found. Run: ollama pull {EMBED_MODEL}")
            return False
        log.info(f"Ollama OK at {OLLAMA_URL} — model {EMBED_MODEL} ready")
        return True
    except Exception as e:
        # Best-effort health check: connection, HTTP and parsing errors all
        # end up here and are reported as a failed check.
        log.error(f"Ollama unreachable at {OLLAMA_URL}: {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def setup_collection(client: QdrantClient, reset: bool = False):
    """Make sure the Qdrant collection exists, optionally recreating it.

    Args:
        client: Qdrant client used for all collection operations.
        reset: When true and the collection exists, drop it first so it is
            created again from scratch.
    """
    present = client.collection_exists(COLLECTION_NAME)

    if present and reset:
        log.info(f"Dropping collection '{COLLECTION_NAME}'...")
        client.delete_collection(COLLECTION_NAME)
        present = False

    if present:
        # Keep the existing collection; just report its current size.
        details = client.get_collection(COLLECTION_NAME)
        log.info(f"Collection '{COLLECTION_NAME}' exists ({details.points_count} points)")
        return

    log.info(f"Creating collection '{COLLECTION_NAME}' (dim={VECTOR_SIZE})...")
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
    )
    # Keyword payload indexes on the two fields, created with the collection.
    for payload_field in ("module", "url"):
        client.create_payload_index(
            COLLECTION_NAME, field_name=payload_field, field_schema="keyword"
        )
|
||||||
|
|
||||||
|
|
||||||
|
def upsert_chunks(client: QdrantClient, chunks: list, vectors: list):
    """Upsert chunks and their vectors into the Qdrant collection.

    Args:
        client: Qdrant client used for the upserts.
        chunks: ``Chunk`` records to store; every field goes into the payload.
        vectors: Embedding vectors, one per chunk, in the same order.

    Raises:
        ValueError: If ``chunks`` and ``vectors`` differ in length.
    """
    # zip() would silently drop the surplus items; fail loudly instead so a
    # chunk can never be stored without (or with the wrong) vector.
    if len(chunks) != len(vectors):
        raise ValueError(f"Got {len(chunks)} chunks but {len(vectors)} vectors")

    points = [
        PointStruct(
            # NOTE(review): only the first 8 hex chars (32 bits) of chunk_id
            # become the point ID, so two different chunks can map to the same
            # ID and presumably overwrite each other on upsert. Widening it
            # changes the IDs of already-indexed points, so it is left as is.
            id=int(chunk.chunk_id[:8], 16),
            vector=vector,
            payload={
                "chunk_id": chunk.chunk_id,
                "doc_id": chunk.doc_id,
                "url": chunk.url,
                "title": chunk.title,
                "module": chunk.module,
                "section": chunk.section,
                "headings": chunk.headings,
                "text": chunk.text,
                "chunk_index": chunk.chunk_index,
            },
        )
        for chunk, vector in zip(chunks, vectors)
    ]
    # Send the points in slices of at most UPSERT_BATCH per request.
    for start in range(0, len(points), UPSERT_BATCH):
        client.upsert(collection_name=COLLECTION_NAME, points=points[start:start + UPSERT_BATCH])
|
||||||
|
|
||||||
|
|
||||||
|
def index(module_filter: str | None = None, reset: bool = False):
    """Chunk the scraped pages, embed them and load them into Qdrant.

    Args:
        module_filter: When given, only pages whose ``module`` equals this
            value are indexed.
        reset: Passed to ``setup_collection``; drops and recreates the
            collection before indexing.

    Raises:
        SystemExit: With code 1 when the Ollama check fails.
        FileNotFoundError: When the raw JSONL file does not exist.

    A failing batch is logged and counted, not raised; indexing continues
    with the next batch after a 2 second pause.
    """
    if not check_ollama():
        raise SystemExit(1)

    if not RAW_DATA_FILE.exists():
        raise FileNotFoundError(
            f"Raw data not found: {RAW_DATA_FILE}\n"
            f"Run the scraper first: docker compose run --rm scraper"
        )

    client = QdrantClient(url=QDRANT_URL)
    setup_collection(client, reset=reset)

    pages = []
    with open(RAW_DATA_FILE, encoding="utf-8") as f:
        for line in f:
            record = line.strip()
            if not record:
                # Blank lines (e.g. from hand-edited or concatenated files)
                # are not JSON; skip them instead of crashing the whole run.
                continue
            page = json.loads(record)
            if module_filter and page.get("module") != module_filter:
                continue
            pages.append(page)
    log.info(f"Loaded {len(pages)} pages")

    all_chunks = []
    for page in pages:
        all_chunks.extend(chunk_page(page))
    log.info(f"Created {len(all_chunks)} chunks")

    total = len(all_chunks)
    embedded = failed = 0

    for i in range(0, total, BATCH_SIZE):
        batch = all_chunks[i:i + BATCH_SIZE]
        try:
            vectors = embed_batch([c.text for c in batch])
            upsert_chunks(client, batch, vectors)
            embedded += len(batch)
            # total > 0 here: the loop body never runs for an empty chunk list.
            log.info(f"Progress: {embedded}/{total} ({embedded/total*100:.0f}%)")
        except Exception as e:
            # Deliberate best-effort: one bad batch must not abort the run.
            log.error(f"Batch {i//BATCH_SIZE} failed: {e}")
            failed += len(batch)
            time.sleep(2)

    info = client.get_collection(COLLECTION_NAME)
    log.info(
        f"\n✅ Done. Embedded: {embedded}, Failed: {failed}\n"
        f"   Total vectors in Qdrant: {info.points_count}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: parse the flags and run the indexer once.
    cli = argparse.ArgumentParser(description="Odoo 18 RAG indexer")
    cli.add_argument("--module", help="Index only one module")
    cli.add_argument("--reset", action="store_true", help="Drop and recreate collection")
    options = cli.parse_args()
    index(module_filter=options.module, reset=options.reset)
|
||||||
9
requirements.txt
Normal file
9
requirements.txt
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
fastapi==0.111.0
|
||||||
|
uvicorn[standard]==0.29.0
|
||||||
|
httpx==0.27.0
|
||||||
|
pydantic==2.7.0
|
||||||
|
qdrant-client==1.9.0
|
||||||
|
requests==2.31.0
|
||||||
|
beautifulsoup4==4.12.3
|
||||||
|
lxml==5.2.1
|
||||||
|
python-dotenv==1.0.1
|
||||||
316
scraper/scraper.py
Normal file
316
scraper/scraper.py
Normal file
@@ -0,0 +1,316 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Odoo 18 Documentation Scraper
|
||||||
|
==============================
|
||||||
|
Crawls the Odoo 18 docs sitemap, extracts clean text from each page,
|
||||||
|
and saves structured JSON ready for the indexer.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scraper.py # full crawl
|
||||||
|
python scraper.py --module accounting # single module
|
||||||
|
python scraper.py --limit 50 # test run
|
||||||
|
|
||||||
|
Output: ../data/raw/odoo18_docs_raw.jsonl
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
import argparse
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from dataclasses import dataclass, asdict
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Log at INFO level using a compact "LEVEL message" format.
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
log = logging.getLogger(__name__)

# Root of the Odoo 18 documentation and the sitemap the crawl starts from.
BASE_URL = "https://www.odoo.com/documentation/18.0"
SITEMAP_URL = f"{BASE_URL}/sitemap.xml"
# Relative paths: they resolve against the current working directory.
OUTPUT_DIR = Path("../data/raw")
OUTPUT_FILE = OUTPUT_DIR / "odoo18_docs_raw.jsonl"
DELAY_SECONDS = 1.2  # pause after each page in crawl()
MAX_RETRIES = 3  # attempts per page in fetch_page()
REQUEST_TIMEOUT = 20  # passed as ``timeout`` to requests.get

# Headers sent with every HTTP request the scraper makes.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (compatible; ActiveBlue-RAG-Indexer/1.0; "
        "+https://activeblue.net)"
    ),
}

# Module name -> URL path fragment, used for the --module filter and by
# infer_module(). Order matters: infer_module() returns the first entry whose
# path occurs in the URL, and "/applications/finance" is a prefix of the
# accounting path, so "accounting" has to stay ahead of "invoicing".
MODULE_PATHS = {
    "accounting":     "/applications/finance/accounting",
    "invoicing":      "/applications/finance",
    "inventory":      "/applications/inventory_and_mrp/inventory",
    "purchase":       "/applications/inventory_and_mrp/purchase",
    "manufacturing":  "/applications/inventory_and_mrp/manufacturing",
    "sales":          "/applications/sales/sales",
    "crm":            "/applications/sales/crm",
    "employees":      "/applications/hr/employees",
    "payroll":        "/applications/hr/payroll",
    "timesheets":     "/applications/services/timesheets",
    "project":        "/applications/services/project",
    "helpdesk":       "/applications/services/helpdesk",
    "ecommerce":      "/applications/websites/ecommerce",
    "website":        "/applications/websites/website",
    "marketing":      "/applications/marketing",
    "pos":            "/applications/sales/point_of_sale",
    "quality":        "/applications/inventory_and_mrp/quality",
    "maintenance":    "/applications/inventory_and_mrp/maintenance",
    "fleet":          "/applications/hr/fleet",
    "discuss":        "/applications/productivity/discuss",
    "studio":         "/applications/studio",
    "general":        "/applications/general",
    "install":        "/administration",
}

# CSS selectors for navigation/boilerplate; clean_text() removes every
# matching element from the parsed page before extracting text.
NOISE_SELECTORS = [
    "nav", "footer", "header", ".toctree-wrapper",
    ".wy-nav-side", ".wy-menu", ".wy-side-nav-search",
    ".rst-footer-buttons", "#edit-on-github",
    "[role='navigation']", ".breadcrumbs",
    ".sidebar", ".sphinxsidebar",
    "script", "style",
]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DocPage:
    """One scraped documentation page; crawl() writes it as a JSONL record."""

    url: str  # page URL as requested
    title: str  # <title> text (or the URL if absent), "— Odoo…" suffix removed
    module: str  # MODULE_PATHS key inferred from the URL, else "general"
    section: str  # value returned by extract_section() for the page
    text: str  # cleaned page text returned by clean_text()
    headings: list  # h2/h3 heading texts collected by clean_text()
    doc_id: str  # first 16 hex chars of the SHA-256 of the URL
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_sitemap_urls(sitemap_url: str, module_filter: str | None) -> list:
    """Return the untranslated Odoo 18 page URLs listed in the sitemap.

    Args:
        sitemap_url: URL of the sitemap XML to download.
        module_filter: Optional ``MODULE_PATHS`` key; when given, only URLs
            containing that module's path are returned.

    Returns:
        The ``<loc>`` URLs that belong to the 18.0 documentation and are not
        recognised as translated pages.

    Raises:
        ValueError: If ``module_filter`` is not a key of ``MODULE_PATHS``.
        requests.RequestException: If the sitemap cannot be downloaded.
    """
    path = None
    if module_filter:
        # Validate before downloading so a typo fails fast.
        path = MODULE_PATHS.get(module_filter)
        if not path:
            raise ValueError(f"Unknown module '{module_filter}'. Choose from: {', '.join(MODULE_PATHS)}")

    log.info(f"Fetching sitemap: {sitemap_url}")
    resp = requests.get(sitemap_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "xml")
    all_urls = [loc.text.strip() for loc in soup.find_all("loc")]

    # Original language list, matched anywhere in the URL (kept unchanged).
    listed_langs = ("fr", "de", "es", "pt", "nl", "zh")
    # Any locale code placed directly after the version segment, e.g.
    # ".../18.0/it/" or ".../18.0/pt_BR/". Anchoring to the version keeps
    # two-letter content directories such as "/applications/hr/" untouched.
    # NOTE(review): assumes translations live at /18.0/<locale>/ — confirm
    # against the live sitemap.
    locale_segment = re.compile(r"/18\.0/[a-z]{2}(?:_[A-Za-z0-9]+)?(?:/|$)")

    def is_translation(u: str) -> bool:
        """Tell whether *u* points at a translated copy of a page."""
        if any(f"/{lang}/" in u for lang in listed_langs):
            return True
        return locale_segment.search(u) is not None

    urls = [
        u for u in all_urls
        if ("/18.0/" in u or "/documentation/18.0" in u) and not is_translation(u)
    ]

    if path:
        urls = [u for u in urls if path in u]
        log.info(f"Module filter '{module_filter}': {len(urls)} pages")
    else:
        log.info(f"Total pages: {len(urls)}")
    return urls
|
||||||
|
|
||||||
|
|
||||||
|
def fallback_urls() -> list:
    """Curated fallback list if sitemap is unavailable.

    Returns:
        Absolute URLs built by appending each curated path to ``BASE_URL``.
    """
    paths = [
        "/applications/finance/accounting.html",
        "/applications/finance/accounting/customer_invoices.html",
        "/applications/finance/accounting/customer_invoices/overview.html",
        "/applications/finance/accounting/vendor_bills.html",
        "/applications/finance/accounting/get_started/chart_of_accounts.html",
        "/applications/finance/accounting/get_started/cheat_sheet.html",
        "/applications/finance/accounting/get_started/multi_currency.html",
        "/applications/finance/accounting/reporting/budget.html",
        "/applications/finance/accounting/reporting/analytic_accounting.html",
        "/applications/finance/accounting/bank.html",
        "/applications/finance/accounting/taxes.html",
        "/applications/finance/accounting/reporting.html",
        "/applications/finance/expenses.html",
        "/applications/finance/expenses/reinvoice_expenses.html",
        "/applications/finance/payment_providers.html",
        "/applications/finance.html",
        "/applications/sales.html",
        "/applications/sales/sales.html",
        "/applications/sales/crm.html",
        "/applications/sales/crm/pipeline.html",
        "/applications/sales/crm/acquire_leads/email_manual.html",
        "/applications/sales/crm/pipeline/manage_sales_teams.html",
        "/applications/sales/crm/optimize/utilize_activities.html",
        "/applications/inventory_and_mrp/inventory.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/mto.html",
        "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/reordering_rules.html",
        "/applications/inventory_and_mrp/inventory/shipping_receiving/daily_operations.html",
        "/applications/inventory_and_mrp/purchase.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/rfq.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/manage.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/blanket_orders.html",
        "/applications/inventory_and_mrp/purchase/manage_deals/calls_for_tenders.html",
        "/applications/inventory_and_mrp/manufacturing.html",
        "/applications/inventory_and_mrp/manufacturing/workflows.html",
        "/applications/inventory_and_mrp/manufacturing/workflows/use_mps.html",
        "/applications/inventory_and_mrp/manufacturing/workflows/manufacturing_backorders.html",
        "/applications/inventory_and_mrp/manufacturing/subcontracting.html",
        "/applications/inventory_and_mrp/manufacturing/advanced_configuration/kit_shipping.html",
        "/applications/hr.html",
        "/applications/hr/employees.html",
        "/applications/hr/employees/new_employee.html",
        "/applications/hr/payroll.html",
        "/applications/hr/payroll/contracts.html",
        "/applications/hr/payroll/payslips.html",
        "/applications/hr/payroll/batches.html",
        "/applications/websites/ecommerce.html",
        "/applications/websites/ecommerce/products.html",
        "/applications/websites/ecommerce/checkout_payment_shipping/checkout.html",
        "/applications/websites/ecommerce/checkout_payment_shipping/payments.html",
        "/applications/websites/ecommerce/customer_accounts.html",
        "/applications/services/helpdesk.html",
        "/applications/services/helpdesk/advanced/after_sales.html",
        "/applications/services/project.html",
        "/applications/finance/fiscal_localizations/united_states.html",
        "/applications.html",
        "/applications/general.html",
    ]
    # Plain concatenation on purpose: every path starts with "/", and
    # urljoin() treats that as an absolute path, which would replace the
    # "/documentation/18.0" part of BASE_URL and yield URLs outside the docs.
    return [f"{BASE_URL}{p}" for p in paths]
|
||||||
|
|
||||||
|
|
||||||
|
def infer_module(url: str) -> str:
    """Map a documentation URL to a module name.

    Returns the first ``MODULE_PATHS`` key (in declaration order) whose path,
    without its leading slash, occurs in ``url``; ``"general"`` if none does.
    """
    matches = (
        name
        for name, fragment in MODULE_PATHS.items()
        if fragment.lstrip("/") in url
    )
    return next(matches, "general")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_section(soup: BeautifulSoup) -> str:
    """Describe where a page sits in the documentation.

    Returns the non-empty breadcrumb link texts joined with ``" > "`` when any
    breadcrumb link is present; otherwise the first ``<h1>`` text, and
    ``"Odoo 18 Docs"`` when the page has no ``<h1>`` either.
    """
    crumbs = soup.select(".breadcrumbs a, .wy-breadcrumbs a, nav[aria-label='breadcrumb'] a")
    if not crumbs:
        top_heading = soup.find("h1")
        if top_heading:
            return top_heading.get_text(strip=True)
        return "Odoo 18 Docs"

    labels = (link.get_text(strip=True) for link in crumbs)
    return " > ".join(label for label in labels if label)
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(soup: BeautifulSoup) -> tuple:
    """Extract readable text and section headings from a parsed page.

    The elements matched by ``NOISE_SELECTORS`` are removed from ``soup`` in
    place, so the caller's tree is modified.

    Returns:
        ``(text, headings)``: the page text with headings rendered as
        ``#``-prefixed lines, list items as ``- `` lines and code in
        backticks, plus the list of h2/h3 heading texts. ``("", [])`` when no
        content container is found.
    """
    for selector in NOISE_SELECTORS:
        for node in soup.select(selector):
            node.decompose()

    # First matching container wins, from most to least specific.
    root = (
        soup.find("div", {"class": "document"})
        or soup.find("article")
        or soup.find("main")
        or soup.find("div", {"role": "main"})
        or soup.find("body")
    )
    if not root:
        return "", []

    heading_tags = ("h1", "h2", "h3", "h4")
    section_titles = []
    parts = []

    # NOTE(review): descendants are visited individually, so text nested in
    # several handled tags (e.g. a <p> or <code> inside an <li>) is emitted
    # once per enclosing tag.
    for node in root.descendants:
        tag = getattr(node, "name", None)
        if tag in heading_tags:
            label = node.get_text(strip=True)
            if not label:
                continue
            hashes = "#" * int(tag[1])  # heading level -> number of "#"
            parts.append(f"\n{hashes} {label}\n")
            if tag in ("h2", "h3"):
                section_titles.append(label)
        elif tag == "p":
            paragraph = node.get_text(separator=" ", strip=True)
            if len(paragraph) > 20:  # drop very short paragraphs
                parts.append(paragraph)
        elif tag == "li":
            item = node.get_text(separator=" ", strip=True)
            if len(item) > 5:
                parts.append(f"- {item}")
        elif tag == "code":
            snippet = node.get_text(strip=True)
            if snippet:
                parts.append(f"`{snippet}`")

    joined = "\n".join(parts)
    # Collapse runs of three or more newlines into a single blank line.
    return re.sub(r"\n{3,}", "\n\n", joined).strip(), section_titles
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_page(url: str) -> DocPage | None:
    """Download one documentation page and turn it into a ``DocPage``.

    Request errors are retried up to ``MAX_RETRIES`` times with exponential
    back-off (1s, 2s, ...).

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The parsed page, or ``None`` when the page is a 404, yields fewer
        than 100 characters of text, or every attempt fails.
    """
    for attempt in range(MAX_RETRIES):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            if resp.status_code == 404:
                # A missing page will not appear on retry; give up at once.
                log.warning(f"404: {url}")
                return None
            resp.raise_for_status()

            soup = BeautifulSoup(resp.text, "html.parser")
            title_tag = soup.find("title")
            title = title_tag.get_text(strip=True) if title_tag else url
            title = re.sub(r"\s*—\s*Odoo.*", "", title).strip()

            # Read the breadcrumbs BEFORE clean_text(): it removes nav,
            # header and .breadcrumbs elements from the soup in place, after
            # which extract_section() can no longer find them.
            section = extract_section(soup)

            text, headings = clean_text(soup)
            if len(text) < 100:
                # Too little content to be worth indexing.
                return None

            return DocPage(
                url=url,
                title=title,
                module=infer_module(url),
                section=section,
                text=text,
                headings=headings,
                doc_id=hashlib.sha256(url.encode()).hexdigest()[:16],
            )
        except requests.RequestException as e:
            if attempt < MAX_RETRIES - 1:
                wait = 2 ** attempt
                log.warning(f"Retry {attempt+1} for {url}: {e} (wait {wait}s)")
                time.sleep(wait)
            else:
                log.error(f"Failed: {url}: {e}")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def crawl(module: str | None = None, limit: int | None = None):
    """Crawl the documentation and write one JSON object per page to disk.

    Args:
        module: Optional ``MODULE_PATHS`` key restricting the crawl.
        limit: Optional maximum number of URLs to fetch.

    Raises:
        ValueError: If ``module`` is not a key of ``MODULE_PATHS``.

    The output file is overwritten on every run. If the sitemap cannot be
    used, the curated fallback list is crawled instead.
    """
    # Reject an unknown module up front. Otherwise the ValueError raised by
    # fetch_sitemap_urls() is swallowed by the broad except below and the
    # fallback filter runs with an empty path, crawling every fallback URL.
    if module and module not in MODULE_PATHS:
        raise ValueError(f"Unknown module '{module}'. Choose from: {', '.join(MODULE_PATHS)}")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    try:
        urls = fetch_sitemap_urls(SITEMAP_URL, module)
    except Exception as e:
        # Deliberate best-effort: any sitemap problem switches to the
        # curated list rather than aborting the crawl.
        log.warning(f"Sitemap unavailable ({e}), using fallback list")
        urls = fallback_urls()
        if module:
            path = MODULE_PATHS[module]
            urls = [u for u in urls if path.lstrip("/") in u]

    if limit:
        urls = urls[:limit]

    log.info(f"Crawling {len(urls)} pages...")
    written = skipped = 0

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for i, url in enumerate(urls, 1):
            log.info(f"[{i}/{len(urls)}] {url}")
            page = fetch_page(url)
            if page:
                f.write(json.dumps(asdict(page), ensure_ascii=False) + "\n")
                written += 1
            else:
                skipped += 1
            # Pause between requests to keep the load on the server low.
            time.sleep(DELAY_SECONDS)

    log.info(f"\n✅ Done. Written: {written}, Skipped: {skipped}")
    log.info(f"   Output: {OUTPUT_FILE}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: parse the flags and run one crawl.
    cli = argparse.ArgumentParser(description="Odoo 18 docs scraper")
    cli.add_argument("--module", help=f"Filter to one module: {', '.join(MODULE_PATHS)}")
    cli.add_argument("--limit", type=int, help="Max pages (for testing)")
    options = cli.parse_args()
    crawl(module=options.module, limit=options.limit)
|
||||||
Reference in New Issue
Block a user