From 1204d243402e0002cf95f651a95fdacebd1ea59b Mon Sep 17 00:00:00 2001 From: tocmo0nlord Date: Thu, 25 Jun 2026 04:17:54 +0000 Subject: [PATCH] Read phone numbers, street numbers, and zips digit-by-digit in TTS Kokoro spoke "983-4969" as "nine hundred eighty-three dash forty-nine sixty-nine". Added SpokenKokoroTTSService which normalizes text just before synthesis (run_tts gets the full sentence): US phone patterns and 4-5 digit runs (street numbers, zips) are spoken one digit at a time, country code dropped, no "dash"/parens. Dates and times are left natural. Deterministic, so it's robust to whatever the model emits. Co-Authored-By: Claude Opus 4.8 --- bot.py | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/bot.py b/bot.py index 1be4bfe..d06e33b 100644 --- a/bot.py +++ b/bot.py @@ -282,6 +282,49 @@ class HintedWhisperSTTService(WhisperSTTService): yield frame +# ── TTS number normalization ────────────────────────────────────────────────── +# Kokoro reads digit strings as cardinals with symbols spoken aloud, e.g. "983-4969" +# becomes "nine hundred eighty-three dash forty-nine sixty-nine". For a phone agent that +# reads back phone numbers, street numbers, and zips, that's unusable. We normalize the +# text right before synthesis (run_tts receives the full sentence) so phone numbers and +# long digit runs are spoken one digit at a time, regardless of what the model emitted. 
# Spoken word for each ASCII digit. _say_digits silently drops any other character.
_DIGIT_WORDS = {
    "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four",
    "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine",
}

# Ten-digit US number: optional "+1"/"1" country code, optional parentheses around the
# area code, optional space/dot/hyphen separators. The digit lookarounds stop the pattern
# from biting a phone-sized chunk out of a longer digit run (card or order numbers),
# which would leave a garbled remainder behind.
_PHONE_RE = re.compile(
    r"(?<!\d)(?:\+?1[\s.\-]?)?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}(?!\d)"
)

# Seven-digit local number such as "983-4969". Only the hyphenated form is accepted:
# "983.4969" is a decimal and a bare seven-digit run is more likely a quantity.
# The hyphen/digit lookarounds keep it away from ZIP+4 codes and longer dashed ids.
_LOCAL_PHONE_RE = re.compile(r"(?<![\d\-])\d{3}-\d{4}(?![\d\-])")

# Stand-alone 4-5 digit runs (street numbers, zip codes). The lookarounds leave currency
# ("$1500"), decimals ("1234.56", "3.14159") and slash dates ("06/25/2026") untouched.
# NOTE: a bare four-digit year ("June 2026") still matches; nothing in the text alone
# distinguishes it from a street number.
_LONGNUM_RE = re.compile(r"(?<!\$)(?<!\d[./])\b\d{4,5}\b(?![./]\d)")


def _say_digits(s: str) -> str:
    """Return the digits of *s* as space-separated words, ignoring every non-digit."""
    return " ".join(_DIGIT_WORDS[c] for c in s if c in _DIGIT_WORDS)


def _phone_to_words(m: re.Match) -> str:
    """Turn a matched phone number into comma-separated groups of spoken digits.

    A leading US country code is dropped. Ten digits are grouped area / prefix / line,
    seven digits prefix / line; the commas give the synthesizer a short pause between
    groups. Any other length is read as one flat digit sequence.
    """
    digits = re.sub(r"\D", "", m.group(0))
    if len(digits) == 11 and digits[0] == "1":  # drop US country code
        digits = digits[1:]
    if len(digits) == 10:
        groups = (digits[:3], digits[3:6], digits[6:])
    elif len(digits) == 7:
        groups = (digits[:3], digits[3:])
    else:
        groups = (digits,)
    return ", ".join(_say_digits(g) for g in groups)


def tts_normalize(text: str) -> str:
    """Make phone numbers, street numbers, and zips speak naturally (digit by digit).

    Substitutions run from most to least specific: full ten-digit phone numbers,
    then hyphenated seven-digit local numbers, then remaining 4-5 digit runs. Each
    pass replaces digits with words, so a later pass cannot re-match earlier output.
    Text without such numbers (including the empty string) is returned unchanged.
    """
    text = _PHONE_RE.sub(_phone_to_words, text)
    text = _LOCAL_PHONE_RE.sub(_phone_to_words, text)
    text = _LONGNUM_RE.sub(lambda m: _say_digits(m.group(0)), text)
    return text


class SpokenKokoroTTSService(KokoroTTSService):
    """KokoroTTSService that normalizes numbers to digit-by-digit speech before synthesis,
    so phone numbers/addresses/zips are read naturally instead of as cardinals + 'dash'."""

    async def run_tts(self, text: str, *args, **kwargs):
        """Normalize *text* with tts_normalize, then yield the base service's frames.

        Extra positional/keyword arguments are forwarded untouched so the override
        keeps working if the base ``run_tts`` takes more than the text.
        NOTE(review): confirm the base signature against the installed pipecat version.
        """
        async for frame in super().run_tts(tts_normalize(text), *args, **kwargs):
            yield frame


# ── Remainder of the patch, kept verbatim for reference (context only, not executable) ──
#  def build_llm_service():
#      """Build the LLM service for the selected provider. The universal LLMContext +
#      aggregators work with either, so only this construction differs (true A/B swap)."""
# @@ -351,7 +394,7 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru
#      llm.register_function("record_appointment_request", _record_appointment)
# -    tts = KokoroTTSService(
# +    tts = SpokenKokoroTTSService(
#          model_path=os.path.join(MODEL_DIR, "kokoro-v1.0.onnx"),
#          voices_path=os.path.join(MODEL_DIR, "voices-v1.0.bin"),
#          settings=KokoroTTSService.Settings(voice=KOKORO_VOICE),