Read phone numbers, street numbers, and zips digit-by-digit in TTS

Kokoro spoke "983-4969" as "nine hundred eighty-three dash forty-nine sixty-nine". Added SpokenKokoroTTSService, which normalizes text just before synthesis (run_tts gets the full sentence): US phone patterns and 4-5 digit runs (street numbers, zips) are spoken one digit at a time, with the country code dropped and no "dash"/parens. Dates and times are left natural. Deterministic, so it's robust to whatever the model emits.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-25 04:17:54 +00:00
parent b31f685d91
commit 1204d24340
1 changed files with 44 additions and 1 deletions
--- a/bot.py
+++ b/bot.py
@@ -282,6 +282,49 @@ class HintedWhisperSTTService(WhisperSTTService):
                yield frame


+# ── TTS number normalization ──────────────────────────────────────────────────
+# Kokoro reads digit strings as cardinals with symbols spoken aloud, e.g. "983-4969"
+# becomes "nine hundred eighty-three dash forty-nine sixty-nine". For a phone agent that
+# reads back phone numbers, street numbers, and zips, that's unusable. We normalize the
+# text right before synthesis (run_tts receives the full sentence) so phone numbers and
+# long digit runs are spoken one digit at a time, regardless of what the model emitted.
+_DIGIT_WORDS = {
+    "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four",
+    "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine",
+}
+_PHONE_RE = re.compile(r"(?:\+?1[\s.\-]?)?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}")
+_LONGNUM_RE = re.compile(r"\b\d{4,5}\b")  # street numbers, zip codes
+
+
+def _say_digits(s: str) -> str:
+    return " ".join(_DIGIT_WORDS[c] for c in s if c in _DIGIT_WORDS)
+
+
+def _phone_to_words(m: re.Match) -> str:
+    d = re.sub(r"\D", "", m.group(0))
+    if len(d) == 11 and d[0] == "1":  # drop US country code
+        d = d[1:]
+    if len(d) == 10:  # group as area / prefix / line for natural cadence
+        return f"{_say_digits(d[:3])}, {_say_digits(d[3:6])}, {_say_digits(d[6:])}"
+    return _say_digits(d)
+
+
+def tts_normalize(text: str) -> str:
+    """Make phone numbers, street numbers, and zips speak naturally (digit by digit)."""
+    text = _PHONE_RE.sub(_phone_to_words, text)
+    text = _LONGNUM_RE.sub(lambda m: _say_digits(m.group(0)), text)
+    return text
+
+
+class SpokenKokoroTTSService(KokoroTTSService):
+    """KokoroTTSService that normalizes numbers to digit-by-digit speech before synthesis,
+    so phone numbers/addresses/zips are read naturally instead of as cardinals + 'dash'."""
+
+    async def run_tts(self, text: str):
+        async for frame in super().run_tts(tts_normalize(text)):
+            yield frame
+
+
 def build_llm_service():
    """Build the LLM service for the selected provider. The universal LLMContext +
    aggregators work with either, so only this construction differs (true A/B swap)."""
@@ -351,7 +394,7 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru

        llm.register_function("record_appointment_request", _record_appointment)

-    tts = KokoroTTSService(
+    tts = SpokenKokoroTTSService(
        model_path=os.path.join(MODEL_DIR, "kokoro-v1.0.onnx"),
        voices_path=os.path.join(MODEL_DIR, "voices-v1.0.bin"),
        settings=KokoroTTSService.Settings(voice=KOKORO_VOICE),