Revert Phase 1 STT/auth swaps: stay on Whisper + Twilio Auth Token
Deepgram and the Twilio Standard API Key were reverted per decision:

- bot.py: restore HintedWhisperSTTService (faster-whisper hotwords), default model medium; remove DeepgramSTTService import + DEEPGRAM_API_KEY.
- server.py: restore TWILIO_AUTH_TOKEN for X-Twilio-Signature validation and the serializer auto-hang-up. Twilio signs webhooks with the Auth Token, so an API Key Secret cannot validate signatures.
- .env.example: back to TWILIO_AUTH_TOKEN + Whisper STT vars.
- .gitignore: ignore runtime *.log (avc_run.log).

OLLAMA_MODEL stays activeblue-avc:latest (the existing pulled tag).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
20
.env.example
20
.env.example
@@ -9,15 +9,12 @@ PORT=8200
|
||||
BIND_HOST=127.0.0.1
|
||||
|
||||
# ── Twilio ───────────────────────────────────────────────────────────────────
|
||||
# From console.twilio.com. Account SID + a Standard API Key (scoped to this app,
|
||||
# revocable independently). The Auth Token stays in the Twilio console only — never on
|
||||
# this server. Create the key under Account → API Keys → Create Standard key, name it
|
||||
# avc-phone-agent-prod; the Secret is shown once. Used to auto-hang-up the carrier leg
|
||||
# and validate inbound webhook signatures.
|
||||
# From console.twilio.com. Used to auto-hang-up the carrier leg and (recommended)
|
||||
# validate inbound webhook signatures. Twilio signs webhooks with the Auth Token, so
|
||||
# signature validation must use the Auth Token (not an API Key Secret).
|
||||
TWILIO_ACCOUNT_SID=ACxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||
TWILIO_API_KEY_SID=SKxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||
TWILIO_API_KEY_SECRET=your_api_key_secret_here
|
||||
# Inbound webhook signature validation is ON whenever TWILIO_API_KEY_SECRET is set.
|
||||
TWILIO_AUTH_TOKEN=your_auth_token_here
|
||||
# Inbound webhook signature validation is ON whenever TWILIO_AUTH_TOKEN is set.
|
||||
# Set TWILIO_VALIDATE=false only for local testing without real Twilio requests.
|
||||
TWILIO_VALIDATE=true
|
||||
# Shared secret embedded in the Media Stream wss URL to gate /ws. Set a stable random
|
||||
@@ -49,12 +46,7 @@ ANTHROPIC_API_KEY=
|
||||
# Default is the most capable model; for low-latency phone voice prefer claude-haiku-4-5
|
||||
# (fastest) or claude-sonnet-4-6 (balance).
|
||||
ANTHROPIC_MODEL=claude-opus-4-8
|
||||
# ── STT: Deepgram (real-time, in-call only) ──────────────────────────────────
|
||||
# Nova-2 delivers end-of-utterance in <300ms (vs Whisper's 1-3s buffering). Key from
|
||||
# console.deepgram.com. Model is fixed to nova-2 in code; DEEPGRAM_MODEL is informational.
|
||||
DEEPGRAM_API_KEY=
|
||||
DEEPGRAM_MODEL=nova-2
|
||||
# Whisper is retained for POST-CALL transcription only (Phase 3), not the live pipeline.
|
||||
# ── STT: Whisper (faster-whisper, real-time in-call) ─────────────────────────
|
||||
WHISPER_MODEL=base
|
||||
WHISPER_DEVICE=cuda
|
||||
WHISPER_COMPUTE=float16
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,6 +1,10 @@
|
||||
# Secrets — never commit
|
||||
.env
|
||||
|
||||
# Runtime logs
|
||||
avc_run.log
|
||||
*.log
|
||||
|
||||
# Recordings (local only, may contain PHI)
|
||||
recordings/
|
||||
|
||||
|
||||
67
bot.py
67
bot.py
@@ -40,9 +40,9 @@ from pipecat.processors.audio.vad_processor import VADProcessor
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
from pipecat.serializers.twilio import TwilioFrameSerializer
|
||||
from pipecat.services.anthropic.llm import AnthropicLLMService
|
||||
from pipecat.services.deepgram.stt import DeepgramSTTService
|
||||
from pipecat.services.kokoro.tts import KokoroTTSService
|
||||
from pipecat.services.ollama.llm import OLLamaLLMService
|
||||
from pipecat.services.whisper.stt import WhisperSTTService
|
||||
from pipecat.transports.websocket.fastapi import (
|
||||
FastAPIWebsocketParams,
|
||||
FastAPIWebsocketTransport,
|
||||
@@ -76,14 +76,21 @@ ENABLE_TOOLS = (
|
||||
LLM_TEMPERATURE = float(os.environ.get("LLM_TEMPERATURE", "0.3"))
|
||||
LLM_MAX_TOKENS = int(os.environ.get("LLM_MAX_TOKENS", "160"))
|
||||
KOKORO_VOICE = os.environ.get("KOKORO_VOICE", "af_heart")
|
||||
# Real-time STT is Deepgram Nova-2: end-of-utterance events in <300ms (vs Whisper's
|
||||
# 1-3s of chunk buffering, the main cause of non-reply / repeat-yourself). Whisper
|
||||
# large-v3 is retained for post-call transcription only (Phase 3).
|
||||
DEEPGRAM_API_KEY = os.environ.get("DEEPGRAM_API_KEY", "")
|
||||
WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "medium") # tiny|base|small|medium
|
||||
WHISPER_DEVICE = os.environ.get("WHISPER_DEVICE", "cuda") # cuda for the 5080
|
||||
WHISPER_COMPUTE = os.environ.get("WHISPER_COMPUTE", "float16")
|
||||
# Bias transcription toward our domain vocabulary (office cities + optometry terms) so
|
||||
# 8 kHz telephony audio doesn't turn "Hialeah" into "high allele" or "eye exam" into "hire".
|
||||
WHISPER_HOTWORDS = os.environ.get(
|
||||
"WHISPER_HOTWORDS",
|
||||
"Advanced Vision Care, eye exam, annual exam, appointment, optometry, contact lens, "
|
||||
"Hialeah, Kendall, Tamarac, Pembroke Pines, Lauderdale Lakes, Miami Gardens, Boca Raton",
|
||||
)
|
||||
|
||||
# Twilio sends 8 kHz mu-law on the wire — we run the PIPELINE at 16 kHz and let
|
||||
# TwilioFrameSerializer resample to/from the 8 kHz wire. (Silero VAD, Deepgram, and
|
||||
# Kokoro are all happy at 16 kHz.)
|
||||
# Twilio sends 8 kHz mu-law on the wire, but faster-whisper assumes any numpy array is
|
||||
# 16 kHz — so we run the PIPELINE at 16 kHz and let TwilioFrameSerializer resample to/from
|
||||
# the 8 kHz wire. Running the pipeline at 8 kHz makes Whisper hear 2x-speed audio and
|
||||
# transcribe nothing. (Silero VAD + Kokoro are happy at 16 kHz too.)
|
||||
WIRE_SAMPLE_RATE = 8000 # Twilio mu-law on the wire (serializer handles this)
|
||||
PIPELINE_SAMPLE_RATE = 16000 # internal rate Whisper/VAD actually need
|
||||
|
||||
@@ -215,6 +222,35 @@ class AudioHeartbeat(FrameProcessor):
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
|
||||
class HintedWhisperSTTService(WhisperSTTService):
    """Whisper STT service that feeds domain `hotwords` to faster-whisper.

    Pipecat's WhisperSTTService has no hotwords option, so while each run_stt()
    call is in flight the model's transcribe() is swapped for a thin wrapper that
    supplies `hotwords` as a default keyword argument, and the real method is put
    back afterwards. The swap is made on this instance's model only; each phone
    call builds its own service instance, so calls do not interfere.
    """

    def __init__(self, *args, hotwords: str | None = None, **kwargs):
        super().__init__(*args, **kwargs)
        # Vocabulary hint string; None or "" disables the transcribe() patch.
        self._hotwords = hotwords

    async def run_stt(self, audio):
        """Yield the parent's STT frames, injecting hotwords while transcribing."""
        should_patch = bool(self._hotwords) and self._model is not None

        if should_patch:
            original_transcribe = self._model.transcribe

            def transcribe_with_hotwords(audio_arg, **kw):
                # setdefault: an explicit hotwords= passed by the caller still wins.
                kw.setdefault("hotwords", self._hotwords)
                return original_transcribe(audio_arg, **kw)

            self._model.transcribe = transcribe_with_hotwords

        try:
            async for frame in super().run_stt(audio):
                yield frame
        finally:
            if should_patch:
                # Restore the real transcribe() even if the stream raises or is closed.
                self._model.transcribe = original_transcribe
|
||||
|
||||
|
||||
def build_llm_service():
|
||||
"""Build the LLM service for the selected provider. The universal LLMContext +
|
||||
aggregators work with either, so only this construction differs (true A/B swap)."""
|
||||
@@ -250,16 +286,11 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru
|
||||
(Twilio Media Stream) and the browser path (WebRTC) — same prompt, model, voice, and
|
||||
booking/hang-up logic; only the transport differs. do_capture writes the post-call
|
||||
appointment to Odoo (on for phone; off for browser testing so it doesn't make cards)."""
|
||||
stt = DeepgramSTTService(
|
||||
api_key=DEEPGRAM_API_KEY,
|
||||
settings=DeepgramSTTService.Settings(
|
||||
model="nova-2",
|
||||
language="en-US",
|
||||
smart_format=True,
|
||||
punctuate=True,
|
||||
interim_results=False, # final transcripts only — avoids double-firing
|
||||
utterance_end_ms=1000, # ms of silence before end-of-utterance fires
|
||||
),
|
||||
stt = HintedWhisperSTTService(
|
||||
settings=WhisperSTTService.Settings(model=WHISPER_MODEL),
|
||||
device=WHISPER_DEVICE,
|
||||
compute_type=WHISPER_COMPUTE,
|
||||
hotwords=WHISPER_HOTWORDS,
|
||||
)
|
||||
llm = build_llm_service()
|
||||
# In-call booking tool — only registered when ENABLE_TOOLS is on (auto: Claude yes,
|
||||
|
||||
34
server.py
34
server.py
@@ -10,8 +10,8 @@ Two endpoints, both reached by Twilio over your public Traefik domain:
|
||||
|
||||
Security:
|
||||
- POST /voice is authenticated with Twilio's X-Twilio-Signature (HMAC-SHA1 over the
|
||||
public URL + sorted POST params, keyed by the API Key Secret). Enforced whenever
|
||||
TWILIO_API_KEY_SECRET is set; set TWILIO_VALIDATE=false to bypass for local testing.
|
||||
public URL + sorted POST params, keyed by the auth token). Enforced whenever
|
||||
TWILIO_AUTH_TOKEN is set; set TWILIO_VALIDATE=false to bypass for local testing.
|
||||
- WS /ws can't carry an X-Twilio-Signature usefully, so we gate it with a shared
|
||||
STREAM_TOKEN embedded in the wss URL we hand Twilio in the TwiML.
|
||||
|
||||
@@ -44,20 +44,16 @@ BIND_HOST = os.environ.get("BIND_HOST", "127.0.0.1")
|
||||
# Twilio REST creds — let the serializer auto-hang-up the carrier leg on EndFrame,
|
||||
# and validate inbound webhook signatures.
|
||||
TWILIO_ACCOUNT_SID = os.environ.get("TWILIO_ACCOUNT_SID")
|
||||
# Standard API Key (scoped to this app, revocable independently) instead of the account
|
||||
# master Auth Token. The Secret is used both for HMAC webhook-signature validation and as
|
||||
# the serializer credential for auto-hang-up.
|
||||
TWILIO_API_KEY_SID = os.environ.get("TWILIO_API_KEY_SID")
|
||||
TWILIO_API_KEY_SECRET = os.environ.get("TWILIO_API_KEY_SECRET")
|
||||
# Signature validation is ON by default when the API key secret exists; explicit opt-out.
|
||||
TWILIO_AUTH_TOKEN = os.environ.get("TWILIO_AUTH_TOKEN")
|
||||
# Signature validation is ON by default when an auth token exists; explicit opt-out.
|
||||
TWILIO_VALIDATE = os.environ.get("TWILIO_VALIDATE", "true").lower() not in ("false", "0", "no")
|
||||
|
||||
# Shared secret embedded in the Media Stream wss URL to gate /ws. Auto-generated if
|
||||
# unset (fine for a single process), but set it in .env for stability across restarts.
|
||||
STREAM_TOKEN = os.environ.get("STREAM_TOKEN") or secrets.token_urlsafe(24)
|
||||
|
||||
# Max simultaneous live calls. Each call holds an Ollama context on the 16GB GPU and
|
||||
# Ollama serializes generation, so cap this to protect call quality.
|
||||
# Max simultaneous live calls. Each call loads a Whisper model + an Ollama context on
|
||||
# the 16GB GPU and Ollama serializes generation, so cap this to protect call quality.
|
||||
# Over-cap callers hear BUSY_MESSAGE and are hung up — existing calls are never degraded.
|
||||
MAX_CONCURRENT_CALLS = int(os.environ.get("MAX_CONCURRENT_CALLS", "2"))
|
||||
BUSY_MESSAGE = os.environ.get(
|
||||
@@ -93,12 +89,12 @@ def _twilio_signature_ok(url: str, params: dict, header_sig: str) -> bool:
|
||||
"""Recompute Twilio's request signature and compare in constant time.
|
||||
|
||||
Algorithm (Twilio docs): take the full public URL, append each POST param as
|
||||
key+value sorted by key, HMAC-SHA1 with the API Key Secret, base64-encode.
|
||||
key+value sorted by key, HMAC-SHA1 with the auth token, base64-encode.
|
||||
"""
|
||||
if not (TWILIO_API_KEY_SECRET and header_sig):
|
||||
if not (TWILIO_AUTH_TOKEN and header_sig):
|
||||
return False
|
||||
payload = url + "".join(f"{k}{params[k]}" for k in sorted(params))
|
||||
digest = hmac.new(TWILIO_API_KEY_SECRET.encode(), payload.encode("utf-8"), hashlib.sha1).digest()
|
||||
digest = hmac.new(TWILIO_AUTH_TOKEN.encode(), payload.encode("utf-8"), hashlib.sha1).digest()
|
||||
expected = base64.b64encode(digest).decode()
|
||||
return hmac.compare_digest(expected, header_sig)
|
||||
|
||||
@@ -108,7 +104,7 @@ async def health():
|
||||
return {
|
||||
"status": "ok",
|
||||
"public_host": PUBLIC_HOST,
|
||||
"validate": TWILIO_VALIDATE and bool(TWILIO_API_KEY_SECRET),
|
||||
"validate": TWILIO_VALIDATE and bool(TWILIO_AUTH_TOKEN),
|
||||
"active_calls": _active_calls,
|
||||
"max_calls": MAX_CONCURRENT_CALLS,
|
||||
}
|
||||
@@ -118,15 +114,15 @@ async def health():
|
||||
async def voice(request: Request):
|
||||
"""TwiML: connect the call to our Media Stream WebSocket (bidirectional)."""
|
||||
form = dict(await request.form())
|
||||
if TWILIO_VALIDATE and TWILIO_API_KEY_SECRET:
|
||||
if TWILIO_VALIDATE and TWILIO_AUTH_TOKEN:
|
||||
# Validate against the PUBLIC url Twilio actually signed, not the internal one.
|
||||
public_url = f"https://{PUBLIC_HOST}/voice"
|
||||
sig = request.headers.get("X-Twilio-Signature", "")
|
||||
if not _twilio_signature_ok(public_url, form, sig):
|
||||
logger.warning("Rejected /voice: bad or missing X-Twilio-Signature")
|
||||
return HTMLResponse(status_code=403, content="forbidden")
|
||||
elif not TWILIO_API_KEY_SECRET:
|
||||
logger.warning("/voice signature validation DISABLED (no TWILIO_API_KEY_SECRET set)")
|
||||
elif not TWILIO_AUTH_TOKEN:
|
||||
logger.warning("/voice signature validation DISABLED (no TWILIO_AUTH_TOKEN set)")
|
||||
|
||||
caller = form.get("From", "") # caller-ID; passed through for appointment callback
|
||||
|
||||
@@ -199,7 +195,7 @@ async def media_stream(websocket: WebSocket):
|
||||
stream_sid=stream_sid,
|
||||
call_sid=call_sid,
|
||||
account_sid=TWILIO_ACCOUNT_SID,
|
||||
auth_token=TWILIO_API_KEY_SECRET,
|
||||
auth_token=TWILIO_AUTH_TOKEN,
|
||||
)
|
||||
await run_call(websocket, serializer, caller_number=caller_number, call_sid=call_sid)
|
||||
except Exception:
|
||||
@@ -214,5 +210,5 @@ if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
logger.info(f"AVC phone agent on {BIND_HOST}:{PORT} | public={PUBLIC_HOST} | "
|
||||
f"sig_validation={'on' if (TWILIO_VALIDATE and TWILIO_API_KEY_SECRET) else 'OFF'}")
|
||||
f"sig_validation={'on' if (TWILIO_VALIDATE and TWILIO_AUTH_TOKEN) else 'OFF'}")
|
||||
uvicorn.run(app, host=BIND_HOST, port=PORT)
|
||||
|
||||
Reference in New Issue
Block a user