Revert Phase 1 STT/auth swaps: stay on Whisper + Twilio Auth Token

Deepgram and the Twilio Standard API Key were reverted per decision:

- bot.py: restore HintedWhisperSTTService (faster-whisper hotwords), default model medium; remove DeepgramSTTService import + DEEPGRAM_API_KEY.
- server.py: restore TWILIO_AUTH_TOKEN for X-Twilio-Signature validation and the serializer auto-hang-up. Twilio signs webhooks with the Auth Token, so an API Key Secret cannot validate signatures.
- .env.example: back to TWILIO_AUTH_TOKEN + Whisper STT vars.
- .gitignore: ignore runtime *.log (avc_run.log).

OLLAMA_MODEL stays activeblue-avc:latest (the existing pulled tag).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-25 01:06:24 +00:00
parent 004ef3bdc0
commit 5ed641255c
4 changed files with 74 additions and 51 deletions
--- a/bot.py
+++ b/bot.py
@@ -40,9 +40,9 @@ from pipecat.processors.audio.vad_processor import VADProcessor
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.serializers.twilio import TwilioFrameSerializer
 from pipecat.services.anthropic.llm import AnthropicLLMService
-from pipecat.services.deepgram.stt import DeepgramSTTService
 from pipecat.services.kokoro.tts import KokoroTTSService
 from pipecat.services.ollama.llm import OLLamaLLMService
+from pipecat.services.whisper.stt import WhisperSTTService
 from pipecat.transports.websocket.fastapi import (
    FastAPIWebsocketParams,
    FastAPIWebsocketTransport,
@@ -76,14 +76,21 @@ ENABLE_TOOLS = (
 LLM_TEMPERATURE = float(os.environ.get("LLM_TEMPERATURE", "0.3"))
 LLM_MAX_TOKENS = int(os.environ.get("LLM_MAX_TOKENS", "160"))
 KOKORO_VOICE = os.environ.get("KOKORO_VOICE", "af_heart")
-# Real-time STT is Deepgram Nova-2: end-of-utterance events in <300ms (vs Whisper's
-# 1-3s of chunk buffering, the main cause of non-reply / repeat-yourself). Whisper
-# large-v3 is retained for post-call transcription only (Phase 3).
-DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY", "")
+WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "medium")   # tiny|base|small|medium
+WHISPER_DEVICE = os.environ.get("WHISPER_DEVICE", "cuda")   # cuda for the 5080
+WHISPER_COMPUTE = os.environ.get("WHISPER_COMPUTE", "float16")
+# Bias transcription toward our domain vocabulary (office cities + optometry terms) so
+# 8 kHz telephony audio doesn't turn "Hialeah" into "high allele" or "eye exam" into "hire".
+WHISPER_HOTWORDS = os.environ.get(
+    "WHISPER_HOTWORDS",
+    "Advanced Vision Care, eye exam, annual exam, appointment, optometry, contact lens, "
+    "Hialeah, Kendall, Tamarac, Pembroke Pines, Lauderdale Lakes, Miami Gardens, Boca Raton",
+)

-# Twilio sends 8 kHz mu-law on the wire — we run the PIPELINE at 16 kHz and let
-# TwilioFrameSerializer resample to/from the 8 kHz wire. (Silero VAD, Deepgram, and
-# Kokoro are all happy at 16 kHz.)
+# Twilio sends 8 kHz mu-law on the wire, but faster-whisper assumes any numpy array is
+# 16 kHz — so we run the PIPELINE at 16 kHz and let TwilioFrameSerializer resample to/from
+# the 8 kHz wire. Running the pipeline at 8 kHz makes Whisper hear 2x-speed audio and
+# transcribe nothing. (Silero VAD + Kokoro are happy at 16 kHz too.)
 WIRE_SAMPLE_RATE = 8000          # Twilio mu-law on the wire (serializer handles this)
 PIPELINE_SAMPLE_RATE = 16000     # internal rate Whisper/VAD actually need

@@ -215,6 +222,35 @@ class AudioHeartbeat(FrameProcessor):
        await self.push_frame(frame, direction)


+class HintedWhisperSTTService(WhisperSTTService):
+    """WhisperSTTService that biases transcription toward domain vocabulary via
+    faster-whisper `hotwords`. Pipecat's service doesn't expose hotwords, so we wrap
+    the model's transcribe() for the duration of each run_stt() invocation. Each phone
+    call gets its own Whisper instance, so this per-instance patch is race-free."""
+
+    def __init__(self, *args, hotwords: str | None = None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._hotwords = hotwords  # None or "" disables biasing (see run_stt)
+
+    async def run_stt(self, audio):
+        if self._hotwords and self._model is not None:
+            real = self._model.transcribe  # saved so `finally` can put it back
+
+            def patched(audio_arg, **kw):
+                kw.setdefault("hotwords", self._hotwords)  # explicit caller value wins
+                return real(audio_arg, **kw)
+
+            self._model.transcribe = patched
+            try:
+                async for frame in super().run_stt(audio):
+                    yield frame
+            finally:  # restore the original even if transcription raises
+                self._model.transcribe = real
+        else:  # no hotwords or no model: defer to the stock implementation
+            async for frame in super().run_stt(audio):
+                yield frame
+
+
 def build_llm_service():
    """Build the LLM service for the selected provider. The universal LLMContext +
    aggregators work with either, so only this construction differs (true A/B swap)."""
@@ -250,16 +286,11 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru
    (Twilio Media Stream) and the browser path (WebRTC) — same prompt, model, voice, and
    booking/hang-up logic; only the transport differs. do_capture writes the post-call
    appointment to Odoo (on for phone; off for browser testing so it doesn't make cards)."""
-    stt = DeepgramSTTService(
-        api_key=DEEPGRAM_API_KEY,
-        settings=DeepgramSTTService.Settings(
-            model="nova-2",
-            language="en-US",
-            smart_format=True,
-            punctuate=True,
-            interim_results=False,      # final transcripts only — avoids double-firing
-            utterance_end_ms=1000,      # ms of silence before end-of-utterance fires
-        ),
+    stt = HintedWhisperSTTService(
+        settings=WhisperSTTService.Settings(model=WHISPER_MODEL),
+        device=WHISPER_DEVICE,
+        compute_type=WHISPER_COMPUTE,
+        hotwords=WHISPER_HOTWORDS,
    )
    llm = build_llm_service()
    # In-call booking tool — only registered when ENABLE_TOOLS is on (auto: Claude yes,