Revert Phase 1 STT/auth swaps: stay on Whisper + Twilio Auth Token
Deepgram and the Twilio Standard API Key were reverted per decision: - bot.py: restore HintedWhisperSTTService (faster-whisper hotwords), default model medium; remove DeepgramSTTService import + DEEPGRAM_API_KEY. - server.py: restore TWILIO_AUTH_TOKEN for X-Twilio-Signature validation and the serializer auto-hang-up. Twilio signs webhooks with the Auth Token, so an API Key Secret cannot validate signatures. - .env.example: back to TWILIO_AUTH_TOKEN + Whisper STT vars. - .gitignore: ignore runtime *.log (avc_run.log). OLLAMA_MODEL stays activeblue-avc:latest (the existing pulled tag). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
67
bot.py
67
bot.py
@@ -40,9 +40,9 @@ from pipecat.processors.audio.vad_processor import VADProcessor
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.serializers.twilio import TwilioFrameSerializer
|
||||
from pipecat.services.anthropic.llm import AnthropicLLMService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.kokoro.tts import KokoroTTSService
|
||||
from pipecat.services.ollama.llm import OLLamaLLMService
|
||||
from pipecat.services.whisper.stt import WhisperSTTService
|
||||
from pipecat.transports.websocket.fastapi import (
|
||||
FastAPIWebsocketParams,
|
||||
FastAPIWebsocketTransport,
|
||||
@@ -76,14 +76,21 @@ ENABLE_TOOLS = (
|
||||
LLM_TEMPERATURE = float(os.environ.get("LLM_TEMPERATURE", "0.3"))
|
||||
LLM_MAX_TOKENS = int(os.environ.get("LLM_MAX_TOKENS", "160"))
|
||||
KOKORO_VOICE = os.environ.get("KOKORO_VOICE", "af_heart")
|
||||
# Real-time STT is Deepgram Nova-2: end-of-utterance events in <300ms (vs Whisper's
|
||||
# 1-3s of chunk buffering, the main cause of non-reply / repeat-yourself). Whisper
|
||||
# large-v3 is retained for post-call transcription only (Phase 3).
|
||||
DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY", "")
|
||||
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "medium") # tiny|base|small|medium
|
||||
WHISPER_DEVICE = os.environ.get("WHISPER_DEVICE", "cuda") # cuda for the 5080
|
||||
WHISPER_COMPUTE = os.environ.get("WHISPER_COMPUTE", "float16")
|
||||
# Bias transcription toward our domain vocabulary (office cities + optometry terms) so
|
||||
# 8 kHz telephony audio doesn't turn "Hialeah" into "high allele" or "eye exam" into "hire".
|
||||
WHISPER_HOTWORDS = os.environ.get(
|
||||
"WHISPER_HOTWORDS",
|
||||
"Advanced Vision Care, eye exam, annual exam, appointment, optometry, contact lens, "
|
||||
"Hialeah, Kendall, Tamarac, Pembroke Pines, Lauderdale Lakes, Miami Gardens, Boca Raton",
|
||||
)
|
||||
|
||||
# Twilio sends 8 kHz mu-law on the wire — we run the PIPELINE at 16 kHz and let
|
||||
# TwilioFrameSerializer resample to/from the 8 kHz wire. (Silero VAD, Deepgram, and
|
||||
# Kokoro are all happy at 16 kHz.)
|
||||
# Twilio sends 8 kHz mu-law on the wire, but faster-whisper assumes any numpy array is
|
||||
# 16 kHz — so we run the PIPELINE at 16 kHz and let TwilioFrameSerializer resample to/from
|
||||
# the 8 kHz wire. Running the pipeline at 8 kHz makes Whisper hear 2x-speed audio and
|
||||
# transcribe nothing. (Silero VAD + Kokoro are happy at 16 kHz too.)
|
||||
WIRE_SAMPLE_RATE = 8000 # Twilio mu-law on the wire (serializer handles this)
|
||||
PIPELINE_SAMPLE_RATE = 16000 # internal rate Whisper/VAD actually need
|
||||
|
||||
@@ -215,6 +222,35 @@ class AudioHeartbeat(FrameProcessor):
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class HintedWhisperSTTService(WhisperSTTService):
    """WhisperSTTService variant that biases transcription toward domain vocabulary.

    Pipecat's service does not expose faster-whisper's ``hotwords`` option, so for
    the duration of each ``run_stt`` call the model's ``transcribe`` is replaced by
    a wrapper that supplies ``hotwords`` as a default keyword argument, and the
    original method is put back afterwards.

    NOTE(review): the original author states that each call gets its own Whisper
    instance, which is why this per-instance patch is considered race-free —
    confirm against the caller before sharing one instance across calls.
    """

    def __init__(self, *args, hotwords: str | None = None, **kwargs):
        """Forward every argument to WhisperSTTService and remember ``hotwords``.

        ``hotwords`` is the vocabulary hint string; ``None`` or an empty string
        means ``run_stt`` defers to the parent without patching anything.
        """
        super().__init__(*args, **kwargs)
        self._hotwords = hotwords

    async def run_stt(self, audio):
        """Yield the parent's STT frames, injecting hotwords when configured."""
        # No hint configured, or the model is None: use the stock behaviour.
        if not self._hotwords or self._model is None:
            async for stt_frame in super().run_stt(audio):
                yield stt_frame
            return

        original_transcribe = self._model.transcribe

        def transcribe_with_hotwords(audio_arg, **options):
            # setdefault: an explicit hotwords= passed by the caller still wins.
            options.setdefault("hotwords", self._hotwords)
            return original_transcribe(audio_arg, **options)

        self._model.transcribe = transcribe_with_hotwords
        try:
            async for stt_frame in super().run_stt(audio):
                yield stt_frame
        finally:
            # Put the original method back whether the parent finished or raised.
            self._model.transcribe = original_transcribe
|
||||
|
||||
|
||||
def build_llm_service():
|
||||
"""Build the LLM service for the selected provider. The universal LLMContext +
|
||||
aggregators work with either, so only this construction differs (true A/B swap)."""
|
||||
@@ -250,16 +286,11 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru
|
||||
(Twilio Media Stream) and the browser path (WebRTC) — same prompt, model, voice, and
|
||||
booking/hang-up logic; only the transport differs. do_capture writes the post-call
|
||||
appointment to Odoo (on for phone; off for browser testing so it doesn't make cards)."""
|
||||
stt = DeepgramSTTService(
|
||||
api_key=DEEPGRAM_API_KEY,
|
||||
settings=DeepgramSTTService.Settings(
|
||||
model="nova-2",
|
||||
language="en-US",
|
||||
smart_format=True,
|
||||
punctuate=True,
|
||||
interim_results=False, # final transcripts only — avoids double-firing
|
||||
utterance_end_ms=1000, # ms of silence before end-of-utterance fires
|
||||
),
|
||||
stt = HintedWhisperSTTService(
|
||||
settings=WhisperSTTService.Settings(model=WHISPER_MODEL),
|
||||
device=WHISPER_DEVICE,
|
||||
compute_type=WHISPER_COMPUTE,
|
||||
hotwords=WHISPER_HOTWORDS,
|
||||
)
|
||||
llm = build_llm_service()
|
||||
# In-call booking tool — only registered when ENABLE_TOOLS is on (auto: Claude yes,
|
||||
|
||||
Reference in New Issue
Block a user