From 94e2ca1902a6d5817933a1ebbe5ba75d0bd420e1 Mon Sep 17 00:00:00 2001 From: tocmo0nlord Date: Sat, 4 Jul 2026 03:23:03 +0000 Subject: [PATCH] Cut smart-turn INCOMPLETE wait 3s -> 1.5s (SMART_TURN_STOP_SECS) Follow-up test call: no more cancelled replies, but 3-5s response gaps on turns the smart-turn model judged INCOMPLETE ("I'm due to my annual exam.") - it waited the library-default 3s of silence before triggering the LLM. Build the stop strategy explicitly with SmartTurnParams(stop_secs=1.5), env-tunable. A caller who really does resume just yields a follow-up turn, which is safe now that interruption broadcasts are off. Co-Authored-By: Claude Opus 4.8 --- .env.example | 3 +++ CLAUDE.md | 9 +++++++++ bot.py | 38 +++++++++++++++++++++++++++----------- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/.env.example b/.env.example index 3199d4c..f648600 100644 --- a/.env.example +++ b/.env.example @@ -68,6 +68,9 @@ VAD_CONFIDENCE=0.5 VAD_MIN_VOLUME=0.15 VAD_START_SECS=0.1 VAD_STOP_SECS=0.5 +# Extra silence the smart-turn analyzer waits when it judges an utterance INCOMPLETE +# before ending the turn anyway (library default 3s caused ~3.5s dead air on some turns). +#SMART_TURN_STOP_SECS=1.5 # Deterministic slot memory (callstate.py): injects an ALREADY-COLLECTED / STILL-NEEDED # checklist into the system prompt each turn + merges VAD-fragmented user turns, so the # local 8B stops re-asking for name/reason/phone. Default: on for ollama, off for anthropic. diff --git a/CLAUDE.md b/CLAUDE.md index 506ab4d..4fc24fb 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -96,6 +96,15 @@ interruption broadcasts at all (there's nothing legitimate for them to do in a n bot). `UserStartedSpeakingFrame` is still emitted, so the watchdog reset keeps working. If the caller talks over generation, both replies play in order instead of one being dropped. +**Smart-turn INCOMPLETE wait tuned 3s → 1.5s (2026-07-04, `SMART_TURN_STOP_SECS`).** The +follow-up test call had no cancellations but still 3–5s gaps on some turns: the smart-turn +model judged utterances like "I'm due to my annual exam." INCOMPLETE and then waited the +library-default 3s of silence ("End of Turn complete due to stop_secs. Silence in ms: 3032") +before triggering the LLM. The stop strategy is now built explicitly with +`LocalSmartTurnAnalyzerV3(params=SmartTurnParams(stop_secs=1.5))`. Worst-case perceived +response gap drops from ~3.5s+synthesis to ~2s+synthesis; a caller who really does resume +just produces a follow-up turn (safe now that interruptions are off). + **`CallStateGroomer` (`callstate.py`) — deterministic slot memory (2026-07-03).** Fixes the 8B re-asking for things the caller already gave (name, reason, phone — seen repeatedly in the historical call logs: "Didn't you say you had my phone number?", "I already gave you my full diff --git a/bot.py b/bot.py index d36e45b..8a56a51 100644 --- a/bot.py +++ b/bot.py @@ -43,10 +43,13 @@ from pipecat.processors.aggregators.llm_response_universal import ( LLMContextAggregatorPair, LLMUserAggregatorParams, ) +from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams +from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3 from pipecat.turns.user_start import ( TranscriptionUserTurnStartStrategy, VADUserTurnStartStrategy, ) +from pipecat.turns.user_stop import TurnAnalyzerUserTurnStopStrategy from pipecat.turns.user_turn_strategies import UserTurnStrategies from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor from pipecat.processors.audio.vad_processor import VADProcessor @@ -129,6 +132,12 @@ ECHO_TAIL_SECS = float(os.environ.get("ECHO_TAIL_SECS", "0.25")) SILENCE_WATCHDOG = os.environ.get("SILENCE_WATCHDOG", "true").lower() not in ("false", "0", "no") SILENCE_REPROMPT_SECS = float(os.environ.get("SILENCE_REPROMPT_SECS", "7.0")) MAX_REPROMPTS = int(os.environ.get("MAX_REPROMPTS", "2")) +# When the smart-turn model judges an utterance INCOMPLETE (trailing intonation), it waits +# this much extra silence before ending the turn anyway. The library default of 3s produced +# 3.5s of dead air on turns like "I'm due to my annual exam." (live call 2026-07-04). 1.5s +# keeps some room for the caller to finish a thought without the reply feeling stalled; with +# interruptions off, a caller who does continue simply gets a second reply in order. +SMART_TURN_STOP_SECS = float(os.environ.get("SMART_TURN_STOP_SECS", "1.5")) # Deterministic slot-state tracking (callstate.py): after each agent turn, extract what the # caller already provided and inject an explicit ALREADY-COLLECTED / STILL-NEEDED checklist # into the system message, plus merge VAD-fragmented user turns. Fixes the 8B re-asking for @@ -692,18 +701,25 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru # an interruption to do, so don't broadcast them at all. UserStartedSpeakingFrame is still # emitted (SilenceWatchdog reset keeps working); if the caller talks over generation, both # replies simply play in order instead of one being thrown away. - if HALF_DUPLEX: - user_params = LLMUserAggregatorParams( - user_turn_strategies=UserTurnStrategies( - start=[ - VADUserTurnStartStrategy(enable_interruptions=False), - TranscriptionUserTurnStartStrategy(enable_interruptions=False), - ], - ), + # Turn-stop: same smart-turn analyzer as the default, but with the INCOMPLETE-verdict + # silence wait tuned down from 3s (see SMART_TURN_STOP_SECS above). + stop_strategies = [ + TurnAnalyzerUserTurnStopStrategy( + turn_analyzer=LocalSmartTurnAnalyzerV3( + params=SmartTurnParams(stop_secs=SMART_TURN_STOP_SECS) + ) ) - agg = LLMContextAggregatorPair(context, user_params=user_params) - else: - agg = LLMContextAggregatorPair(context) + ] + user_params = LLMUserAggregatorParams( + user_turn_strategies=UserTurnStrategies( + start=[ + VADUserTurnStartStrategy(enable_interruptions=False), + TranscriptionUserTurnStartStrategy(enable_interruptions=False), + ] if HALF_DUPLEX else None, # None -> library defaults (interruptions on) + stop=stop_strategies, + ), + ) + agg = LLMContextAggregatorPair(context, user_params=user_params) # Deterministic slot memory: merges fragmented user turns + injects the live # collected/needed checklist into the system message before each generation. groomer = CallStateGroomer(