diff --git a/.env.example b/.env.example index 7e78eac..879528a 100644 --- a/.env.example +++ b/.env.example @@ -59,3 +59,7 @@ AGENT_NAME=AVA AGENT_NAME_SPOKEN=Eva # Grace pause after the goodbye before the carrier leg is dropped (seconds). HANGUP_DELAY_SECS=4.0 +# Half-duplex: ignore caller audio while the agent speaks (+ tail) so its own echo on the +# phone line can't trigger a false barge-in that cancels its reply. false = allow barge-in. +HALF_DUPLEX=true +ECHO_TAIL_SECS=0.5 diff --git a/CLAUDE.md b/CLAUDE.md index eb08e52..6f109f8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -62,6 +62,15 @@ VAD side. **`AudioHeartbeat`** — diagnostic processor that distinguishes VAD failure from transport stall. Keep it. +**`HalfDuplexGate` in `bot.py`** — fixes echo-induced mid-call silence. In this pipecat build +interruptions are VAD-driven and always on (`PipelineParams.allow_interruptions` does NOT exist +— it's silently ignored). On a phone line the agent's own TTS echoes back, the VAD reads it as +the caller speaking (it produces NO transcript), and the broadcast interruption cancels the +agent mid-reply → the caller hears silence. This gate sits BEFORE the VAD and withholds inbound +audio while the bot is speaking (+`ECHO_TAIL_SECS`, default 0.5s) so echo never reaches the VAD. +Trade-off: half-duplex — the caller can't barge in mid-utterance (fine for short replies). +`HALF_DUPLEX=false` restores barge-in. Keep it on for telephony. + **Post-call extraction (`extract.py`)** — single JSON-mode completion after call ends. Correctly uses `format: json`, uses verified Twilio caller-ID instead of trusting model output, falls back to JSONL if Odoo is unreachable. Keep it. 
diff --git a/bot.py b/bot.py
index c88da71..cb59730 100644
--- a/bot.py
+++ b/bot.py
@@ -23,6 +23,7 @@ from pipecat.adapters.schemas.tools_schema import ToolsSchema
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
 from pipecat.frames.frames import (
+    BotStartedSpeakingFrame,
     BotStoppedSpeakingFrame,
     EndFrame,
     EndTaskFrame,
@@ -101,6 +102,11 @@ VAD_CONFIDENCE = float(os.environ.get("VAD_CONFIDENCE", "0.5"))
 VAD_MIN_VOLUME = float(os.environ.get("VAD_MIN_VOLUME", "0.3"))
 VAD_START_SECS = float(os.environ.get("VAD_START_SECS", "0.2"))
 VAD_STOP_SECS = float(os.environ.get("VAD_STOP_SECS", "0.5"))
+# Half-duplex: ignore inbound audio while the agent is speaking (+ this tail in seconds)
+# so the agent's own voice echoing back the phone line can't trigger a false barge-in that
+# cancels its reply (= caller hears silence). Set HALF_DUPLEX=false to allow barge-in.
+HALF_DUPLEX = os.environ.get("HALF_DUPLEX", "true").strip().lower() not in ("false", "0", "no")
+ECHO_TAIL_SECS = float(os.environ.get("ECHO_TAIL_SECS", "0.5"))
 
 # Agent persona name — purely for warmth; change/remove freely.
 AGENT_NAME = os.environ.get("AGENT_NAME", "Sofia")
@@ -304,6 +310,42 @@ class AudioHeartbeat(FrameProcessor):
         await self.push_frame(frame, direction)
 
 
+class HalfDuplexGate(FrameProcessor):
+    """Drops inbound audio while the agent is speaking (plus ECHO_TAIL_SECS after it stops).
+
+    In this pipecat build interruptions are VAD-driven and always on (PipelineParams has no
+    allow_interruptions). On a phone line the agent's own TTS echoes back and the VAD reads it
+    as the caller speaking → it broadcasts an interruption that cancels the agent mid-reply, so
+    the caller hears silence. Sitting BEFORE the VAD, this gate withholds inbound audio frames
+    while the bot is speaking, so its echo never reaches the VAD. Trade-off: the caller can't
+    barge in mid-utterance (fine for short receptionist replies). Bypass with HALF_DUPLEX=false.
+
+    The echo tail is timed with time.monotonic(), not the wall clock, so a clock adjustment
+    (NTP step, manual change) mid-call can neither hold the gate shut nor reopen it early.
+    """
+
+    def __init__(self, tail_secs: float = 0.5):
+        super().__init__()
+        self._bot_speaking = False
+        # Monotonic-clock deadline until which inbound audio is still dropped after the bot stops.
+        self._reopen_at = 0.0
+        self._tail = tail_secs
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+        if isinstance(frame, BotStartedSpeakingFrame):
+            self._bot_speaking = True
+        elif isinstance(frame, BotStoppedSpeakingFrame):
+            self._bot_speaking = False
+            self._reopen_at = time.monotonic() + self._tail
+        # Withhold caller audio while the bot speaks (+ echo tail) so echo can't barge in.
+        if isinstance(frame, InputAudioRawFrame) and (
+            self._bot_speaking or time.monotonic() < self._reopen_at
+        ):
+            return
+        await self.push_frame(frame, direction)
+
+
 class HintedWhisperSTTService(WhisperSTTService):
     """WhisperSTTService that biases transcription toward domain vocabulary via
     faster-whisper `hotwords`. Pipecat's service doesn't expose hotwords, so we wrap
@@ -466,6 +508,7 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru
         min_volume=VAD_MIN_VOLUME,
     )))
     heartbeat = AudioHeartbeat()
+    gate = HalfDuplexGate(tail_secs=ECHO_TAIL_SECS) if HALF_DUPLEX else None
 
     # Per-call system message = static prompt + the caller-ID number to confirm. Inject it
     # ALREADY spelled out digit-by-digit so the model repeats clean words instead of mangling
@@ -505,6 +548,7 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru
         [
             transport.input(),
             heartbeat,
+            *([gate] if gate else []),  # half-duplex echo gate, before the VAD
             vad,
             stt,
             agg.user(),