From 5ed641255c1003044f7205ded533aa87dc8c746b Mon Sep 17 00:00:00 2001 From: tocmo0nlord Date: Thu, 25 Jun 2026 01:06:24 +0000 Subject: [PATCH] Revert Phase 1 STT/auth swaps: stay on Whisper + Twilio Auth Token Deepgram and the Twilio Standard API Key were reverted per decision: - bot.py: restore HintedWhisperSTTService (faster-whisper hotwords), default model medium; remove DeepgramSTTService import + DEEPGRAM_API_KEY. - server.py: restore TWILIO_AUTH_TOKEN for X-Twilio-Signature validation and the serializer auto-hang-up. Twilio signs webhooks with the Auth Token, so an API Key Secret cannot validate signatures. - .env.example: back to TWILIO_AUTH_TOKEN + Whisper STT vars. - .gitignore: ignore runtime *.log (avc_run.log). OLLAMA_MODEL stays activeblue-avc:latest (the existing pulled tag). Co-Authored-By: Claude Opus 4.8 --- .env.example | 20 +++++----------- .gitignore | 4 ++++ bot.py | 67 ++++++++++++++++++++++++++++++++++++++-------------- server.py | 34 ++++++++++++-------------- 4 files changed, 74 insertions(+), 51 deletions(-) diff --git a/.env.example b/.env.example index 40c20de..63a7db3 100644 --- a/.env.example +++ b/.env.example @@ -9,15 +9,12 @@ PORT=8200 BIND_HOST=127.0.0.1 # ── Twilio ─────────────────────────────────────────────────────────────────── -# From console.twilio.com. Account SID + a Standard API Key (scoped to this app, -# revocable independently). The Auth Token stays in the Twilio console only — never on -# this server. Create the key under Account → API Keys → Create Standard key, name it -# avc-phone-agent-prod; the Secret is shown once. Used to auto-hang-up the carrier leg -# and validate inbound webhook signatures. +# From console.twilio.com. Used to auto-hang-up the carrier leg and (recommended) +# validate inbound webhook signatures. Twilio signs webhooks with the Auth Token, so +# signature validation must use the Auth Token (not an API Key Secret). TWILIO_ACCOUNT_SID=ACxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx -TWILIO_API_KEY_SID=SKxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx -TWILIO_API_KEY_SECRET=your_api_key_secret_here -# Inbound webhook signature validation is ON whenever TWILIO_API_KEY_SECRET is set. +TWILIO_AUTH_TOKEN=your_auth_token_here +# Inbound webhook signature validation is ON whenever TWILIO_AUTH_TOKEN is set. # Set to false only for local testing without real Twilio requests. TWILIO_VALIDATE=true # Shared secret embedded in the Media Stream wss URL to gate /ws. Set a stable random @@ -49,12 +46,7 @@ ANTHROPIC_API_KEY= # Default is the most capable model; for low-latency phone voice prefer claude-haiku-4-5 # (fastest) or claude-sonnet-4-6 (balance). ANTHROPIC_MODEL=claude-opus-4-8 -# ── STT: Deepgram (real-time, in-call only) ────────────────────────────────── -# Nova-2 delivers end-of-utterance in <300ms (vs Whisper's 1-3s buffering). Key from -# console.deepgram.com. Model is fixed to nova-2 in code; DEEPGRAM_MODEL is informational. -DEEPGRAM_API_KEY= -DEEPGRAM_MODEL=nova-2 -# Whisper is retained for POST-CALL transcription only (Phase 3), not the live pipeline. +# ── STT: Whisper (faster-whisper, real-time in-call) ───────────────────────── WHISPER_MODEL=base WHISPER_DEVICE=cuda WHISPER_COMPUTE=float16 diff --git a/.gitignore b/.gitignore index ff04b07..b5675ce 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,10 @@ # Secrets — never commit .env +# Runtime logs +avc_run.log +*.log + # Recordings (local only, may contain PHI) recordings/ diff --git a/bot.py b/bot.py index cc2a717..3281fa0 100644 --- a/bot.py +++ b/bot.py @@ -40,9 +40,9 @@ from pipecat.processors.audio.vad_processor import VADProcessor from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.serializers.twilio import TwilioFrameSerializer from pipecat.services.anthropic.llm import AnthropicLLMService -from pipecat.services.deepgram.stt import DeepgramSTTService from pipecat.services.kokoro.tts import KokoroTTSService from pipecat.services.ollama.llm import OLLamaLLMService +from pipecat.services.whisper.stt import WhisperSTTService from pipecat.transports.websocket.fastapi import ( FastAPIWebsocketParams, FastAPIWebsocketTransport, @@ -76,14 +76,21 @@ ENABLE_TOOLS = ( LLM_TEMPERATURE = float(os.environ.get("LLM_TEMPERATURE", "0.3")) LLM_MAX_TOKENS = int(os.environ.get("LLM_MAX_TOKENS", "160")) KOKORO_VOICE = os.environ.get("KOKORO_VOICE", "af_heart") -# Real-time STT is Deepgram Nova-2: end-of-utterance events in <300ms (vs Whisper's -# 1-3s of chunk buffering, the main cause of non-reply / repeat-yourself). Whisper -# large-v3 is retained for post-call transcription only (Phase 3). -DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY", "") +WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "medium") # tiny|base|small|medium +WHISPER_DEVICE = os.environ.get("WHISPER_DEVICE", "cuda") # cuda for the 5080 +WHISPER_COMPUTE = os.environ.get("WHISPER_COMPUTE", "float16") +# Bias transcription toward our domain vocabulary (office cities + optometry terms) so +# 8 kHz telephony audio doesn't turn "Hialeah" into "high allele" or "eye exam" into "hire". +WHISPER_HOTWORDS = os.environ.get( + "WHISPER_HOTWORDS", + "Advanced Vision Care, eye exam, annual exam, appointment, optometry, contact lens, " + "Hialeah, Kendall, Tamarac, Pembroke Pines, Lauderdale Lakes, Miami Gardens, Boca Raton", +) -# Twilio sends 8 kHz mu-law on the wire — we run the PIPELINE at 16 kHz and let -# TwilioFrameSerializer resample to/from the 8 kHz wire. (Silero VAD, Deepgram, and -# Kokoro are all happy at 16 kHz.) +# Twilio sends 8 kHz mu-law on the wire, but faster-whisper assumes any numpy array is +# 16 kHz — so we run the PIPELINE at 16 kHz and let TwilioFrameSerializer resample to/from +# the 8 kHz wire. Running the pipeline at 8 kHz makes Whisper hear 2x-speed audio and +# transcribe nothing. (Silero VAD + Kokoro are happy at 16 kHz too.) WIRE_SAMPLE_RATE = 8000 # Twilio mu-law on the wire (serializer handles this) PIPELINE_SAMPLE_RATE = 16000 # internal rate Whisper/VAD actually need @@ -215,6 +222,35 @@ class AudioHeartbeat(FrameProcessor): await self.push_frame(frame, direction) +class HintedWhisperSTTService(WhisperSTTService): + """WhisperSTTService that biases transcription toward domain vocabulary via + faster-whisper `hotwords`. Pipecat's service doesn't expose hotwords, so we wrap + the model's transcribe() for the duration of each call. Each call gets its own + Whisper instance, so this per-instance patch is race-free.""" + + def __init__(self, *args, hotwords: str | None = None, **kwargs): + super().__init__(*args, **kwargs) + self._hotwords = hotwords + + async def run_stt(self, audio): + if self._hotwords and self._model is not None: + real = self._model.transcribe + + def patched(audio_arg, **kw): + kw.setdefault("hotwords", self._hotwords) + return real(audio_arg, **kw) + + self._model.transcribe = patched + try: + async for frame in super().run_stt(audio): + yield frame + finally: + self._model.transcribe = real + else: + async for frame in super().run_stt(audio): + yield frame + + def build_llm_service(): """Build the LLM service for the selected provider. The universal LLMContext + aggregators work with either, so only this construction differs (true A/B swap).""" @@ -250,16 +286,11 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru (Twilio Media Stream) and the browser path (WebRTC) — same prompt, model, voice, and booking/hang-up logic; only the transport differs. do_capture writes the post-call appointment to Odoo (on for phone; off for browser testing so it doesn't make cards).""" - stt = DeepgramSTTService( - api_key=DEEPGRAM_API_KEY, - settings=DeepgramSTTService.Settings( - model="nova-2", - language="en-US", - smart_format=True, - punctuate=True, - interim_results=False, # final transcripts only — avoids double-firing - utterance_end_ms=1000, # ms of silence before end-of-utterance fires - ), + stt = HintedWhisperSTTService( + settings=WhisperSTTService.Settings(model=WHISPER_MODEL), + device=WHISPER_DEVICE, + compute_type=WHISPER_COMPUTE, + hotwords=WHISPER_HOTWORDS, ) llm = build_llm_service() # In-call booking tool — only registered when ENABLE_TOOLS is on (auto: Claude yes, diff --git a/server.py b/server.py index db3c5a8..407706b 100644 --- a/server.py +++ b/server.py @@ -10,8 +10,8 @@ Two endpoints, both reached by Twilio over your public Traefik domain: Security: - POST /voice is authenticated with Twilio's X-Twilio-Signature (HMAC-SHA1 over the - public URL + sorted POST params, keyed by the API Key Secret). Enforced whenever - TWILIO_API_KEY_SECRET is set; set TWILIO_VALIDATE=false to bypass for local testing. + public URL + sorted POST params, keyed by the auth token). Enforced whenever + TWILIO_AUTH_TOKEN is set; set TWILIO_VALIDATE=false to bypass for local testing. - WS /ws can't carry an X-Twilio-Signature usefully, so we gate it with a shared STREAM_TOKEN embedded in the wss URL we hand Twilio in the TwiML. @@ -44,20 +44,16 @@ BIND_HOST = os.environ.get("BIND_HOST", "127.0.0.1") # Twilio REST creds — let the serializer auto-hang-up the carrier leg on EndFrame, # and validate inbound webhook signatures. TWILIO_ACCOUNT_SID = os.environ.get("TWILIO_ACCOUNT_SID") -# Standard API Key (scoped to this app, revocable independently) instead of the account -# master Auth Token. The Secret is used both for HMAC webhook-signature validation and as -# the serializer credential for auto-hang-up. -TWILIO_API_KEY_SID = os.environ.get("TWILIO_API_KEY_SID") -TWILIO_API_KEY_SECRET = os.environ.get("TWILIO_API_KEY_SECRET") -# Signature validation is ON by default when the API key secret exists; explicit opt-out. +TWILIO_AUTH_TOKEN = os.environ.get("TWILIO_AUTH_TOKEN") +# Signature validation is ON by default when an auth token exists; explicit opt-out. TWILIO_VALIDATE = os.environ.get("TWILIO_VALIDATE", "true").lower() not in ("false", "0", "no") # Shared secret embedded in the Media Stream wss URL to gate /ws. Auto-generated if # unset (fine for a single process), but set it in .env for stability across restarts. STREAM_TOKEN = os.environ.get("STREAM_TOKEN") or secrets.token_urlsafe(24) -# Max simultaneous live calls. Each call holds an Ollama context on the 16GB GPU and -# Ollama serializes generation, so cap this to protect call quality. +# Max simultaneous live calls. Each call loads a Whisper model + an Ollama context on +# the 16GB GPU and Ollama serializes generation, so cap this to protect call quality. # Over-cap callers hear BUSY_MESSAGE and are hung up — existing calls are never degraded. MAX_CONCURRENT_CALLS = int(os.environ.get("MAX_CONCURRENT_CALLS", "2")) BUSY_MESSAGE = os.environ.get( @@ -93,12 +89,12 @@ def _twilio_signature_ok(url: str, params: dict, header_sig: str) -> bool: """Recompute Twilio's request signature and compare in constant time. Algorithm (Twilio docs): take the full public URL, append each POST param as - key+value sorted by key, HMAC-SHA1 with the API Key Secret, base64-encode. + key+value sorted by key, HMAC-SHA1 with the auth token, base64-encode. """ - if not (TWILIO_API_KEY_SECRET and header_sig): + if not (TWILIO_AUTH_TOKEN and header_sig): return False payload = url + "".join(f"{k}{params[k]}" for k in sorted(params)) - digest = hmac.new(TWILIO_API_KEY_SECRET.encode(), payload.encode("utf-8"), hashlib.sha1).digest() + digest = hmac.new(TWILIO_AUTH_TOKEN.encode(), payload.encode("utf-8"), hashlib.sha1).digest() expected = base64.b64encode(digest).decode() return hmac.compare_digest(expected, header_sig) @@ -108,7 +104,7 @@ async def health(): return { "status": "ok", "public_host": PUBLIC_HOST, - "validate": TWILIO_VALIDATE and bool(TWILIO_API_KEY_SECRET), + "validate": TWILIO_VALIDATE and bool(TWILIO_AUTH_TOKEN), "active_calls": _active_calls, "max_calls": MAX_CONCURRENT_CALLS, } @@ -118,15 +114,15 @@ async def health(): async def voice(request: Request): """TwiML: connect the call to our Media Stream WebSocket (bidirectional).""" form = dict(await request.form()) - if TWILIO_VALIDATE and TWILIO_API_KEY_SECRET: + if TWILIO_VALIDATE and TWILIO_AUTH_TOKEN: # Validate against the PUBLIC url Twilio actually signed, not the internal one. public_url = f"https://{PUBLIC_HOST}/voice" sig = request.headers.get("X-Twilio-Signature", "") if not _twilio_signature_ok(public_url, form, sig): logger.warning("Rejected /voice: bad or missing X-Twilio-Signature") return HTMLResponse(status_code=403, content="forbidden") - elif not TWILIO_API_KEY_SECRET: - logger.warning("/voice signature validation DISABLED (no TWILIO_API_KEY_SECRET set)") + elif not TWILIO_AUTH_TOKEN: + logger.warning("/voice signature validation DISABLED (no TWILIO_AUTH_TOKEN set)") caller = form.get("From", "") # caller-ID; passed through for appointment callback @@ -199,7 +195,7 @@ async def media_stream(websocket: WebSocket): stream_sid=stream_sid, call_sid=call_sid, account_sid=TWILIO_ACCOUNT_SID, - auth_token=TWILIO_API_KEY_SECRET, + auth_token=TWILIO_AUTH_TOKEN, ) await run_call(websocket, serializer, caller_number=caller_number, call_sid=call_sid) except Exception: @@ -214,5 +210,5 @@ if __name__ == "__main__": import uvicorn logger.info(f"AVC phone agent on {BIND_HOST}:{PORT} | public={PUBLIC_HOST} | " - f"sig_validation={'on' if (TWILIO_VALIDATE and TWILIO_API_KEY_SECRET) else 'OFF'}") + f"sig_validation={'on' if (TWILIO_VALIDATE and TWILIO_AUTH_TOKEN) else 'OFF'}") uvicorn.run(app, host=BIND_HOST, port=PORT)