Fix echo-induced silence with a half-duplex audio gate
A caller's reply was generated but never heard: 0.65s after the agent started speaking, the VAD fired "user started speaking" (NO transcript) and broadcast an interruption that cancelled the agent's audio -> ~24s of silence until the caller spoke again. Cause: the agent's own TTS echoes back over the phone line and the always-on VAD interruption treats it as a barge-in. (PipelineParams has no allow_interruptions in this pipecat build — it was a silent no-op.) Fix: HalfDuplexGate before the VAD withholds inbound audio while the bot speaks (+ECHO_TAIL_SECS, default 0.5s), so echo can't trigger a false barge-in. Half-duplex (no mid-utterance barge-in); HALF_DUPLEX=false to restore it. Runtime-tested the gate (pass idle / drop while speaking / drop in tail / resume). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -59,3 +59,7 @@ AGENT_NAME=AVA
|
|||||||
AGENT_NAME_SPOKEN=Eva
|
AGENT_NAME_SPOKEN=Eva
|
||||||
# Grace pause after the goodbye before the carrier leg is dropped (seconds).
|
# Grace pause after the goodbye before the carrier leg is dropped (seconds).
|
||||||
HANGUP_DELAY_SECS=4.0
|
HANGUP_DELAY_SECS=4.0
|
||||||
|
# Half-duplex: ignore caller audio while the agent speaks (+ tail) so its own echo on the
|
||||||
|
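# phone line can't trigger a false barge-in that cancels its reply. Set HALF_DUPLEX=false to allow barge-in. ECHO_TAIL_SECS = extra seconds the gate stays closed after the agent stops speaking.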
|
||||||
|
HALF_DUPLEX=true
|
||||||
|
ECHO_TAIL_SECS=0.5
|
||||||
|
|||||||
@@ -62,6 +62,15 @@ VAD side.
|
|||||||
**`AudioHeartbeat`** — diagnostic processor that distinguishes VAD failure from
|
**`AudioHeartbeat`** — diagnostic processor that distinguishes VAD failure from
|
||||||
transport stall. Keep it.
|
transport stall. Keep it.
|
||||||
|
|
||||||
|
**`HalfDuplexGate` in `bot.py`** — fixes echo-induced mid-call silence. In this pipecat build
|
||||||
|
interruptions are VAD-driven and always on (`PipelineParams.allow_interruptions` does NOT exist
|
||||||
|
— it's silently ignored). On a phone line the agent's own TTS echoes back, the VAD reads it as
|
||||||
|
the caller speaking (it produces NO transcript), and the broadcast interruption cancels the
|
||||||
|
agent mid-reply → the caller hears silence. This gate sits BEFORE the VAD and withholds inbound
|
||||||
|
audio while the bot is speaking (+`ECHO_TAIL_SECS`, default 0.5s) so echo never reaches the VAD.
|
||||||
|
Trade-off: half-duplex — the caller can't barge in mid-utterance (fine for short replies).
|
||||||
|
`HALF_DUPLEX=false` restores barge-in. Keep it on for telephony.
|
||||||
|
|
||||||
**Post-call extraction (`extract.py`)** — single JSON-mode completion after call ends.
|
**Post-call extraction (`extract.py`)** — single JSON-mode completion after call ends.
|
||||||
Correctly uses `format: json`, uses verified Twilio caller-ID instead of trusting model
|
Correctly uses `format: json`, uses verified Twilio caller-ID instead of trusting model
|
||||||
output, falls back to JSONL if Odoo is unreachable. Keep it.
|
output, falls back to JSONL if Odoo is unreachable. Keep it.
|
||||||
|
|||||||
37
bot.py
37
bot.py
@@ -23,6 +23,7 @@ from pipecat.adapters.schemas.tools_schema import ToolsSchema
|
|||||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||||
from pipecat.frames.frames import (
|
from pipecat.frames.frames import (
|
||||||
|
BotStartedSpeakingFrame,
|
||||||
BotStoppedSpeakingFrame,
|
BotStoppedSpeakingFrame,
|
||||||
EndFrame,
|
EndFrame,
|
||||||
EndTaskFrame,
|
EndTaskFrame,
|
||||||
@@ -101,6 +102,11 @@ VAD_CONFIDENCE = float(os.environ.get("VAD_CONFIDENCE", "0.5"))
|
|||||||
VAD_MIN_VOLUME = float(os.environ.get("VAD_MIN_VOLUME", "0.3"))
|
VAD_MIN_VOLUME = float(os.environ.get("VAD_MIN_VOLUME", "0.3"))
|
||||||
VAD_START_SECS = float(os.environ.get("VAD_START_SECS", "0.2"))
|
VAD_START_SECS = float(os.environ.get("VAD_START_SECS", "0.2"))
|
||||||
VAD_STOP_SECS = float(os.environ.get("VAD_STOP_SECS", "0.5"))
|
VAD_STOP_SECS = float(os.environ.get("VAD_STOP_SECS", "0.5"))
|
||||||
|
# Half-duplex: ignore inbound audio while the agent is speaking (+ ECHO_TAIL_SECS seconds after
# it stops) so the agent's own voice echoing back over the phone line can't trigger a false
# barge-in that cancels its reply (= caller hears silence). Set HALF_DUPLEX=false to allow barge-in.
# The value is stripped before comparison so stray whitespace or a trailing CR from a .env file
# can't silently turn "false" into "enabled". Any value other than false/0/no keeps the gate on.
HALF_DUPLEX = os.environ.get("HALF_DUPLEX", "true").strip().lower() not in ("false", "0", "no")
ECHO_TAIL_SECS = float(os.environ.get("ECHO_TAIL_SECS", "0.5"))
|
||||||
|
|
||||||
# Agent persona name — purely for warmth; change/remove freely.
|
# Agent persona name — purely for warmth; change/remove freely.
|
||||||
AGENT_NAME = os.environ.get("AGENT_NAME", "Sofia")
|
AGENT_NAME = os.environ.get("AGENT_NAME", "Sofia")
|
||||||
@@ -304,6 +310,35 @@ class AudioHeartbeat(FrameProcessor):
|
|||||||
await self.push_frame(frame, direction)
|
await self.push_frame(frame, direction)
|
||||||
|
|
||||||
|
|
||||||
|
class HalfDuplexGate(FrameProcessor):
    """Drop inbound audio while the agent is speaking (plus an echo tail after it stops).

    In this pipecat build interruptions are VAD-driven and always on (PipelineParams has no
    allow_interruptions). On a phone line the agent's own TTS echoes back and the VAD reads it
    as the caller speaking → it broadcasts an interruption that cancels the agent mid-reply, so
    the caller hears silence. Sitting BEFORE the VAD, this gate withholds inbound audio frames
    while the bot is speaking, so its echo never reaches the VAD. Trade-off: the caller can't
    barge in mid-utterance (fine for short receptionist replies). Bypass with HALF_DUPLEX=false.

    Only ``InputAudioRawFrame`` frames are ever dropped; every other frame is forwarded
    unchanged in the direction it arrived.

    Args:
        tail_secs: Seconds to keep the gate closed after ``BotStoppedSpeakingFrame`` is seen,
            to cover echo still in flight on the line.

    NOTE(review): the gate reopens only when a ``BotStoppedSpeakingFrame`` is observed. If the
    transport could ever emit a start without a matching stop, inbound audio would stay
    withheld — confirm the transport always pairs them.
    """

    def __init__(self, tail_secs: float = 0.5):
        super().__init__()
        # True between BotStartedSpeakingFrame and BotStoppedSpeakingFrame.
        self._bot_speaking = False
        # Monotonic-clock deadline; inbound audio is withheld until it has passed.
        self._reopen_at = 0.0
        self._tail = tail_secs

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Track bot speaking state and forward ``frame`` unless it is gated caller audio."""
        await super().process_frame(frame, direction)

        if isinstance(frame, BotStartedSpeakingFrame):
            self._bot_speaking = True
        elif isinstance(frame, BotStoppedSpeakingFrame):
            self._bot_speaking = False
            # Use the monotonic clock: a wall-clock step (NTP, manual change) must not
            # shorten the echo tail or hold the gate closed longer than intended.
            self._reopen_at = time.monotonic() + self._tail

        # Withhold caller audio while the bot speaks (+ echo tail) so echo can't barge in.
        if isinstance(frame, InputAudioRawFrame) and (
            self._bot_speaking or time.monotonic() < self._reopen_at
        ):
            return

        await self.push_frame(frame, direction)
|
||||||
|
|
||||||
|
|
||||||
class HintedWhisperSTTService(WhisperSTTService):
|
class HintedWhisperSTTService(WhisperSTTService):
|
||||||
"""WhisperSTTService that biases transcription toward domain vocabulary via
|
"""WhisperSTTService that biases transcription toward domain vocabulary via
|
||||||
faster-whisper `hotwords`. Pipecat's service doesn't expose hotwords, so we wrap
|
faster-whisper `hotwords`. Pipecat's service doesn't expose hotwords, so we wrap
|
||||||
@@ -466,6 +501,7 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru
|
|||||||
min_volume=VAD_MIN_VOLUME,
|
min_volume=VAD_MIN_VOLUME,
|
||||||
)))
|
)))
|
||||||
heartbeat = AudioHeartbeat()
|
heartbeat = AudioHeartbeat()
|
||||||
|
gate = HalfDuplexGate(tail_secs=ECHO_TAIL_SECS) if HALF_DUPLEX else None
|
||||||
|
|
||||||
# Per-call system message = static prompt + the caller-ID number to confirm. Inject it
|
# Per-call system message = static prompt + the caller-ID number to confirm. Inject it
|
||||||
# ALREADY spelled out digit-by-digit so the model repeats clean words instead of mangling
|
# ALREADY spelled out digit-by-digit so the model repeats clean words instead of mangling
|
||||||
@@ -505,6 +541,7 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru
|
|||||||
[
|
[
|
||||||
transport.input(),
|
transport.input(),
|
||||||
heartbeat,
|
heartbeat,
|
||||||
|
*( [gate] if gate else [] ), # half-duplex echo gate, before the VAD
|
||||||
vad,
|
vad,
|
||||||
stt,
|
stt,
|
||||||
agg.user(),
|
agg.user(),
|
||||||
|
|||||||
Reference in New Issue
Block a user