Revert Phase 1 STT/auth swaps: stay on Whisper + Twilio Auth Token

Deepgram and the Twilio Standard API Key were reverted per decision:
- bot.py: restore HintedWhisperSTTService (faster-whisper hotwords), default
  model medium; remove DeepgramSTTService import + DEEPGRAM_API_KEY.
- server.py: restore TWILIO_AUTH_TOKEN for X-Twilio-Signature validation and
  the serializer auto-hang-up. Twilio signs webhooks with the Auth Token, so
  an API Key Secret cannot validate signatures.
- .env.example: back to TWILIO_AUTH_TOKEN + Whisper STT vars.
- .gitignore: ignore runtime *.log (avc_run.log).

OLLAMA_MODEL stays activeblue-avc:latest (the existing pulled tag).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
tocmo0nlord
2026-06-25 01:06:24 +00:00
parent 004ef3bdc0
commit 5ed641255c
4 changed files with 74 additions and 51 deletions

67
bot.py
View File

@@ -40,9 +40,9 @@ from pipecat.processors.audio.vad_processor import VADProcessor
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.serializers.twilio import TwilioFrameSerializer
from pipecat.services.anthropic.llm import AnthropicLLMService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.kokoro.tts import KokoroTTSService
from pipecat.services.ollama.llm import OLLamaLLMService
from pipecat.services.whisper.stt import WhisperSTTService
from pipecat.transports.websocket.fastapi import (
FastAPIWebsocketParams,
FastAPIWebsocketTransport,
@@ -76,14 +76,21 @@ ENABLE_TOOLS = (
LLM_TEMPERATURE = float(os.environ.get("LLM_TEMPERATURE", "0.3"))
LLM_MAX_TOKENS = int(os.environ.get("LLM_MAX_TOKENS", "160"))
KOKORO_VOICE = os.environ.get("KOKORO_VOICE", "af_heart")
# Real-time STT is Deepgram Nova-2: end-of-utterance events in <300ms (vs Whisper's
# 1-3s of chunk buffering, the main cause of non-reply / repeat-yourself). Whisper
# large-v3 is retained for post-call transcription only (Phase 3).
DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY", "")
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "medium") # tiny|base|small|medium
WHISPER_DEVICE = os.environ.get("WHISPER_DEVICE", "cuda") # cuda for the 5080
WHISPER_COMPUTE = os.environ.get("WHISPER_COMPUTE", "float16")
# Bias transcription toward our domain vocabulary (office cities + optometry terms) so
# 8 kHz telephony audio doesn't turn "Hialeah" into "high allele" or "eye exam" into "hire".
WHISPER_HOTWORDS = os.environ.get(
"WHISPER_HOTWORDS",
"Advanced Vision Care, eye exam, annual exam, appointment, optometry, contact lens, "
"Hialeah, Kendall, Tamarac, Pembroke Pines, Lauderdale Lakes, Miami Gardens, Boca Raton",
)
# Twilio sends 8 kHz mu-law on the wire — we run the PIPELINE at 16 kHz and let
# TwilioFrameSerializer resample to/from the 8 kHz wire. (Silero VAD, Deepgram, and
# Kokoro are all happy at 16 kHz.)
# Twilio sends 8 kHz mu-law on the wire, but faster-whisper assumes any numpy array is
# 16 kHz — so we run the PIPELINE at 16 kHz and let TwilioFrameSerializer resample to/from
# the 8 kHz wire. Running the pipeline at 8 kHz makes Whisper hear 2x-speed audio and
# transcribe nothing. (Silero VAD + Kokoro are happy at 16 kHz too.)
WIRE_SAMPLE_RATE = 8000 # Twilio mu-law on the wire (serializer handles this)
PIPELINE_SAMPLE_RATE = 16000 # internal rate Whisper/VAD actually need
@@ -215,6 +222,35 @@ class AudioHeartbeat(FrameProcessor):
await self.push_frame(frame, direction)
class HintedWhisperSTTService(WhisperSTTService):
    """Whisper STT service that steers transcription toward domain vocabulary.

    Pipecat's WhisperSTTService has no `hotwords` option, so for the duration of
    each run_stt() call the loaded model's transcribe() is swapped for a thin
    wrapper that supplies faster-whisper `hotwords` by default; the original
    callable is put back when the call finishes. The patch lives on this
    instance's model only (the design assumes one service instance per phone
    call, so patches from different calls never touch the same model).
    """

    def __init__(self, *args, hotwords: str | None = None, **kwargs):
        """Forward all arguments to WhisperSTTService and remember `hotwords`.

        Args:
            hotwords: Vocabulary hint string handed to the model's transcribe();
                a falsy value (None or "") turns hinting off.
        """
        super().__init__(*args, **kwargs)
        self._hotwords = hotwords

    async def run_stt(self, audio):
        """Yield the parent's STT frames, with hotwords injected when enabled."""
        # Nothing to hint with, or no model to patch: plain pass-through.
        if not self._hotwords or self._model is None:
            async for frame in super().run_stt(audio):
                yield frame
            return

        original_transcribe = self._model.transcribe

        def transcribe_with_hotwords(samples, **options):
            # setdefault so an explicit hotwords= from the caller still wins.
            options.setdefault("hotwords", self._hotwords)
            return original_transcribe(samples, **options)

        self._model.transcribe = transcribe_with_hotwords
        try:
            async for frame in super().run_stt(audio):
                yield frame
        finally:
            # Always undo the patch, even if transcription raises or the
            # consumer stops iterating early.
            self._model.transcribe = original_transcribe
def build_llm_service():
"""Build the LLM service for the selected provider. The universal LLMContext +
aggregators work with either, so only this construction differs (true A/B swap)."""
@@ -250,16 +286,11 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru
(Twilio Media Stream) and the browser path (WebRTC) — same prompt, model, voice, and
booking/hang-up logic; only the transport differs. do_capture writes the post-call
appointment to Odoo (on for phone; off for browser testing so it doesn't make cards)."""
stt = DeepgramSTTService(
api_key=DEEPGRAM_API_KEY,
settings=DeepgramSTTService.Settings(
model="nova-2",
language="en-US",
smart_format=True,
punctuate=True,
interim_results=False, # final transcripts only — avoids double-firing
utterance_end_ms=1000, # ms of silence before end-of-utterance fires
),
stt = HintedWhisperSTTService(
settings=WhisperSTTService.Settings(model=WHISPER_MODEL),
device=WHISPER_DEVICE,
compute_type=WHISPER_COMPUTE,
hotwords=WHISPER_HOTWORDS,
)
llm = build_llm_service()
# In-call booking tool — only registered when ENABLE_TOOLS is on (auto: Claude yes,