Revert Phase 1 STT/auth swaps: stay on Whisper + Twilio Auth Token

Deepgram and the Twilio Standard API Key were reverted per decision:
- bot.py: restore HintedWhisperSTTService (faster-whisper hotwords), default
  model medium; remove DeepgramSTTService import + DEEPGRAM_API_KEY.
- server.py: restore TWILIO_AUTH_TOKEN for X-Twilio-Signature validation and
  the serializer auto-hang-up. Twilio signs webhooks with the Auth Token, so
  an API Key Secret cannot validate signatures.
- .env.example: back to TWILIO_AUTH_TOKEN + Whisper STT vars.
- .gitignore: ignore runtime *.log (avc_run.log).

OLLAMA_MODEL stays activeblue-avc:latest (the existing pulled tag).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
tocmo0nlord
2026-06-25 01:06:24 +00:00
parent 004ef3bdc0
commit 5ed641255c
4 changed files with 74 additions and 51 deletions

View File

@@ -9,15 +9,12 @@ PORT=8200
BIND_HOST=127.0.0.1
# ── Twilio ───────────────────────────────────────────────────────────────────
# From console.twilio.com. Account SID + a Standard API Key (scoped to this app,
# revocable independently). The Auth Token stays in the Twilio console only — never on
# this server. Create the key under Account → API Keys → Create Standard key, name it
# avc-phone-agent-prod; the Secret is shown once. Used to auto-hang-up the carrier leg
# and validate inbound webhook signatures.
# From console.twilio.com. Used to auto-hang-up the carrier leg and (recommended)
# validate inbound webhook signatures. Twilio signs webhooks with the Auth Token, so
# signature validation must use the Auth Token (not an API Key Secret).
TWILIO_ACCOUNT_SID=ACxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
TWILIO_API_KEY_SID=SKxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
TWILIO_API_KEY_SECRET=your_api_key_secret_here
# Inbound webhook signature validation is ON whenever TWILIO_API_KEY_SECRET is set.
TWILIO_AUTH_TOKEN=your_auth_token_here
# Inbound webhook signature validation is ON whenever TWILIO_AUTH_TOKEN is set.
# Set to false only for local testing without real Twilio requests.
TWILIO_VALIDATE=true
# Shared secret embedded in the Media Stream wss URL to gate /ws. Set a stable random
@@ -49,12 +46,7 @@ ANTHROPIC_API_KEY=
# Default is the most capable model; for low-latency phone voice prefer claude-haiku-4-5
# (fastest) or claude-sonnet-4-6 (balance).
ANTHROPIC_MODEL=claude-opus-4-8
# ── STT: Deepgram (real-time, in-call only) ──────────────────────────────────
# Nova-2 delivers end-of-utterance in <300ms (vs Whisper's 1-3s buffering). Key from
# console.deepgram.com. Model is fixed to nova-2 in code; DEEPGRAM_MODEL is informational.
DEEPGRAM_API_KEY=
DEEPGRAM_MODEL=nova-2
# Whisper is retained for POST-CALL transcription only (Phase 3), not the live pipeline.
# ── STT: Whisper (faster-whisper, real-time in-call) ─────────────────────────
WHISPER_MODEL=base
WHISPER_DEVICE=cuda
WHISPER_COMPUTE=float16

4
.gitignore vendored
View File

@@ -1,6 +1,10 @@
# Secrets — never commit
.env
# Runtime logs
avc_run.log
*.log
# Recordings (local only, may contain PHI)
recordings/

67
bot.py
View File

@@ -40,9 +40,9 @@ from pipecat.processors.audio.vad_processor import VADProcessor
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.serializers.twilio import TwilioFrameSerializer
from pipecat.services.anthropic.llm import AnthropicLLMService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.kokoro.tts import KokoroTTSService
from pipecat.services.ollama.llm import OLLamaLLMService
from pipecat.services.whisper.stt import WhisperSTTService
from pipecat.transports.websocket.fastapi import (
FastAPIWebsocketParams,
FastAPIWebsocketTransport,
@@ -76,14 +76,21 @@ ENABLE_TOOLS = (
LLM_TEMPERATURE = float(os.environ.get("LLM_TEMPERATURE", "0.3"))
LLM_MAX_TOKENS = int(os.environ.get("LLM_MAX_TOKENS", "160"))
KOKORO_VOICE = os.environ.get("KOKORO_VOICE", "af_heart")
# Real-time STT is Deepgram Nova-2: end-of-utterance events in <300ms (vs Whisper's
# 1-3s of chunk buffering, the main cause of non-reply / repeat-yourself). Whisper
# large-v3 is retained for post-call transcription only (Phase 3).
DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY", "")
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "medium") # tiny|base|small|medium
WHISPER_DEVICE = os.environ.get("WHISPER_DEVICE", "cuda") # cuda for the 5080
WHISPER_COMPUTE = os.environ.get("WHISPER_COMPUTE", "float16")
# Bias transcription toward our domain vocabulary (office cities + optometry terms) so
# 8 kHz telephony audio doesn't turn "Hialeah" into "high allele" or "eye exam" into "hire".
WHISPER_HOTWORDS = os.environ.get(
"WHISPER_HOTWORDS",
"Advanced Vision Care, eye exam, annual exam, appointment, optometry, contact lens, "
"Hialeah, Kendall, Tamarac, Pembroke Pines, Lauderdale Lakes, Miami Gardens, Boca Raton",
)
# Twilio sends 8 kHz mu-law on the wire — we run the PIPELINE at 16 kHz and let
# TwilioFrameSerializer resample to/from the 8 kHz wire. (Silero VAD, Deepgram, and
# Kokoro are all happy at 16 kHz.)
# Twilio sends 8 kHz mu-law on the wire, but faster-whisper assumes any numpy array is
# 16 kHz — so we run the PIPELINE at 16 kHz and let TwilioFrameSerializer resample to/from
# the 8 kHz wire. Running the pipeline at 8 kHz makes Whisper hear 2x-speed audio and
# transcribe nothing. (Silero VAD + Kokoro are happy at 16 kHz too.)
WIRE_SAMPLE_RATE = 8000 # Twilio mu-law on the wire (serializer handles this)
PIPELINE_SAMPLE_RATE = 16000 # internal rate Whisper/VAD actually need
@@ -215,6 +222,35 @@ class AudioHeartbeat(FrameProcessor):
await self.push_frame(frame, direction)
class HintedWhisperSTTService(WhisperSTTService):
    """Whisper STT service that passes a domain-vocabulary hint to the model.

    The hint is supplied as the ``hotwords`` keyword of the underlying model's
    ``transcribe()``. Per the original author's note, Pipecat's Whisper service
    has no hotwords option of its own, so this subclass temporarily swaps the
    ``transcribe`` attribute on the loaded model object while each ``run_stt``
    call is in flight and puts the previous callable back afterwards.

    NOTE(review): the swap is not guarded against overlapping ``run_stt`` calls
    on the same instance. The original author's note says each phone call gets
    its own Whisper instance, which is what makes this safe -- confirm against
    the caller if that ever changes.
    """

    def __init__(self, *args, hotwords: str | None = None, **kwargs):
        # Everything except `hotwords` is forwarded untouched to the base service.
        super().__init__(*args, **kwargs)
        self._hotwords = hotwords

    async def run_stt(self, audio):
        """Yield the frames produced by the base ``run_stt`` for ``audio``.

        When a hint is configured and a model is loaded, the model's
        ``transcribe`` is wrapped for the duration of this call so that
        ``hotwords`` is injected unless the caller already supplied one.
        """
        # Nothing to inject (no hint, or no model yet): plain pass-through.
        if not self._hotwords or self._model is None:
            async for frame in super().run_stt(audio):
                yield frame
            return

        original_transcribe = self._model.transcribe

        def transcribe_with_hint(samples, **options):
            # An explicit hotwords value from the caller always wins.
            if "hotwords" not in options:
                options["hotwords"] = self._hotwords
            return original_transcribe(samples, **options)

        self._model.transcribe = transcribe_with_hint
        try:
            async for frame in super().run_stt(audio):
                yield frame
        finally:
            # Put the saved callable back even if the consumer stops early or
            # transcription raises.
            self._model.transcribe = original_transcribe
def build_llm_service():
"""Build the LLM service for the selected provider. The universal LLMContext +
aggregators work with either, so only this construction differs (true A/B swap)."""
@@ -250,16 +286,11 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru
(Twilio Media Stream) and the browser path (WebRTC) — same prompt, model, voice, and
booking/hang-up logic; only the transport differs. do_capture writes the post-call
appointment to Odoo (on for phone; off for browser testing so it doesn't make cards)."""
stt = DeepgramSTTService(
api_key=DEEPGRAM_API_KEY,
settings=DeepgramSTTService.Settings(
model="nova-2",
language="en-US",
smart_format=True,
punctuate=True,
interim_results=False, # final transcripts only — avoids double-firing
utterance_end_ms=1000, # ms of silence before end-of-utterance fires
),
stt = HintedWhisperSTTService(
settings=WhisperSTTService.Settings(model=WHISPER_MODEL),
device=WHISPER_DEVICE,
compute_type=WHISPER_COMPUTE,
hotwords=WHISPER_HOTWORDS,
)
llm = build_llm_service()
# In-call booking tool — only registered when ENABLE_TOOLS is on (auto: Claude yes,

View File

@@ -10,8 +10,8 @@ Two endpoints, both reached by Twilio over your public Traefik domain:
Security:
- POST /voice is authenticated with Twilio's X-Twilio-Signature (HMAC-SHA1 over the
public URL + sorted POST params, keyed by the API Key Secret). Enforced whenever
TWILIO_API_KEY_SECRET is set; set TWILIO_VALIDATE=false to bypass for local testing.
public URL + sorted POST params, keyed by the auth token). Enforced whenever
TWILIO_AUTH_TOKEN is set; set TWILIO_VALIDATE=false to bypass for local testing.
- WS /ws can't carry an X-Twilio-Signature usefully, so we gate it with a shared
STREAM_TOKEN embedded in the wss URL we hand Twilio in the TwiML.
@@ -44,20 +44,16 @@ BIND_HOST = os.environ.get("BIND_HOST", "127.0.0.1")
# Twilio REST creds — let the serializer auto-hang-up the carrier leg on EndFrame,
# and validate inbound webhook signatures.
TWILIO_ACCOUNT_SID = os.environ.get("TWILIO_ACCOUNT_SID")
# Standard API Key (scoped to this app, revocable independently) instead of the account
# master Auth Token. The Secret is used both for HMAC webhook-signature validation and as
# the serializer credential for auto-hang-up.
TWILIO_API_KEY_SID = os.environ.get("TWILIO_API_KEY_SID")
TWILIO_API_KEY_SECRET = os.environ.get("TWILIO_API_KEY_SECRET")
# Signature validation is ON by default when the API key secret exists; explicit opt-out.
TWILIO_AUTH_TOKEN = os.environ.get("TWILIO_AUTH_TOKEN")
# Signature validation is ON by default when an auth token exists; explicit opt-out.
TWILIO_VALIDATE = os.environ.get("TWILIO_VALIDATE", "true").lower() not in ("false", "0", "no")
# Shared secret embedded in the Media Stream wss URL to gate /ws. Auto-generated if
# unset (fine for a single process), but set it in .env for stability across restarts.
STREAM_TOKEN = os.environ.get("STREAM_TOKEN") or secrets.token_urlsafe(24)
# Max simultaneous live calls. Each call holds an Ollama context on the 16GB GPU and
# Ollama serializes generation, so cap this to protect call quality.
# Max simultaneous live calls. Each call loads a Whisper model + an Ollama context on
# the 16GB GPU and Ollama serializes generation, so cap this to protect call quality.
# Over-cap callers hear BUSY_MESSAGE and are hung up — existing calls are never degraded.
MAX_CONCURRENT_CALLS = int(os.environ.get("MAX_CONCURRENT_CALLS", "2"))
BUSY_MESSAGE = os.environ.get(
@@ -93,12 +89,12 @@ def _twilio_signature_ok(url: str, params: dict, header_sig: str) -> bool:
"""Recompute Twilio's request signature and compare in constant time.
Algorithm (Twilio docs): take the full public URL, append each POST param as
key+value sorted by key, HMAC-SHA1 with the API Key Secret, base64-encode.
key+value sorted by key, HMAC-SHA1 with the auth token, base64-encode.
"""
if not (TWILIO_API_KEY_SECRET and header_sig):
if not (TWILIO_AUTH_TOKEN and header_sig):
return False
payload = url + "".join(f"{k}{params[k]}" for k in sorted(params))
digest = hmac.new(TWILIO_API_KEY_SECRET.encode(), payload.encode("utf-8"), hashlib.sha1).digest()
digest = hmac.new(TWILIO_AUTH_TOKEN.encode(), payload.encode("utf-8"), hashlib.sha1).digest()
expected = base64.b64encode(digest).decode()
return hmac.compare_digest(expected, header_sig)
@@ -108,7 +104,7 @@ async def health():
return {
"status": "ok",
"public_host": PUBLIC_HOST,
"validate": TWILIO_VALIDATE and bool(TWILIO_API_KEY_SECRET),
"validate": TWILIO_VALIDATE and bool(TWILIO_AUTH_TOKEN),
"active_calls": _active_calls,
"max_calls": MAX_CONCURRENT_CALLS,
}
@@ -118,15 +114,15 @@ async def health():
async def voice(request: Request):
"""TwiML: connect the call to our Media Stream WebSocket (bidirectional)."""
form = dict(await request.form())
if TWILIO_VALIDATE and TWILIO_API_KEY_SECRET:
if TWILIO_VALIDATE and TWILIO_AUTH_TOKEN:
# Validate against the PUBLIC url Twilio actually signed, not the internal one.
public_url = f"https://{PUBLIC_HOST}/voice"
sig = request.headers.get("X-Twilio-Signature", "")
if not _twilio_signature_ok(public_url, form, sig):
logger.warning("Rejected /voice: bad or missing X-Twilio-Signature")
return HTMLResponse(status_code=403, content="forbidden")
elif not TWILIO_API_KEY_SECRET:
logger.warning("/voice signature validation DISABLED (no TWILIO_API_KEY_SECRET set)")
elif not TWILIO_AUTH_TOKEN:
logger.warning("/voice signature validation DISABLED (no TWILIO_AUTH_TOKEN set)")
caller = form.get("From", "") # caller-ID; passed through for appointment callback
@@ -199,7 +195,7 @@ async def media_stream(websocket: WebSocket):
stream_sid=stream_sid,
call_sid=call_sid,
account_sid=TWILIO_ACCOUNT_SID,
auth_token=TWILIO_API_KEY_SECRET,
auth_token=TWILIO_AUTH_TOKEN,
)
await run_call(websocket, serializer, caller_number=caller_number, call_sid=call_sid)
except Exception:
@@ -214,5 +210,5 @@ if __name__ == "__main__":
import uvicorn
logger.info(f"AVC phone agent on {BIND_HOST}:{PORT} | public={PUBLIC_HOST} | "
f"sig_validation={'on' if (TWILIO_VALIDATE and TWILIO_API_KEY_SECRET) else 'OFF'}")
f"sig_validation={'on' if (TWILIO_VALIDATE and TWILIO_AUTH_TOKEN) else 'OFF'}")
uvicorn.run(app, host=BIND_HOST, port=PORT)