Read phone numbers, street numbers, and zips digit-by-digit in TTS

Kokoro spoke "983-4969" as "nine hundred eighty-three dash forty-nine sixty-
nine". Added SpokenKokoroTTSService which normalizes text just before synthesis
(run_tts gets the full sentence): US phone patterns and 4-5 digit runs (street
numbers, zips) are spoken one digit at a time, country code dropped, no "dash"/
parens. Dates and times are left natural. Deterministic, so it's robust to
whatever the model emits.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
tocmo0nlord
2026-06-25 04:17:54 +00:00
parent b31f685d91
commit 1204d24340

45
bot.py
View File

@@ -282,6 +282,49 @@ class HintedWhisperSTTService(WhisperSTTService):
yield frame
# ── TTS number normalization ──────────────────────────────────────────────────
# Kokoro reads digit strings as cardinals with symbols spoken aloud, e.g. "983-4969"
# becomes "nine hundred eighty-three dash forty-nine sixty-nine". For a phone agent that
# reads back phone numbers, street numbers, and zips, that's unusable. We normalize the
# text right before synthesis (run_tts receives the full sentence) so phone numbers and
# long digit runs are spoken one digit at a time, regardless of what the model emitted.
# Spoken form of each ASCII decimal digit.
_DIGIT_WORDS = {
    "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four",
    "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine",
}

# US phone numbers. Two shapes are accepted:
#   * 10 digits with an optional +1/1 country code, optional parentheses around the
#     area code, and optional space/dot/hyphen separators;
#   * 7-digit local numbers, but only when written with a dot or hyphen ("983-4969"),
#     so a bare 7-digit quantity is not mistaken for a phone number.
# The digit lookarounds stop the pattern from cutting a phone-sized slice out of a
# longer digit run (e.g. a 16-digit reference number), which would leave spoken words
# glued to the remaining raw digits.
_PHONE_RE = re.compile(
    r"(?<!\d)"
    r"(?:"
    r"(?:\+?1[\s.\-]?)?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}"
    r"|\d{3}[.\-]\d{4}"
    r")"
    r"(?!\d)"
)

# Stand-alone 4-5 digit runs: street numbers, zip codes.
# NOTE(review): this also matches 4-digit years and amounts such as "1500"; confirm
# that reading those digit by digit is acceptable for this agent.
_LONGNUM_RE = re.compile(r"\b\d{4,5}\b")


def _say_digits(s: str) -> str:
    """Return the digits of *s* as space-separated words, ignoring other characters."""
    return " ".join(_DIGIT_WORDS[c] for c in s if c in _DIGIT_WORDS)


def _phone_to_words(m: re.Match) -> str:
    """Convert a ``_PHONE_RE`` match into comma-separated groups of spoken digits.

    A leading US country code ("1" on an 11-digit number) is dropped. Ten-digit
    numbers are grouped area / prefix / line and seven-digit numbers prefix / line;
    the commas are there to give the synthesizer a pause between groups. Any other
    length is spoken as one ungrouped run.
    """
    digits = re.sub(r"\D", "", m.group(0))
    if len(digits) == 11 and digits[0] == "1":  # drop US country code
        digits = digits[1:]
    if len(digits) == 10:
        groups = (digits[:3], digits[3:6], digits[6:])
    elif len(digits) == 7:
        groups = (digits[:3], digits[3:])
    else:
        groups = (digits,)
    return ", ".join(_say_digits(g) for g in groups)


def tts_normalize(text: str) -> str:
    """Rewrite phone numbers, street numbers, and zips so they are read digit by digit.

    Phone numbers are replaced first so that their 4-digit line part is not picked up
    separately by the long-number pass. Text without a matching pattern is returned
    unchanged.
    """
    text = _PHONE_RE.sub(_phone_to_words, text)
    return _LONGNUM_RE.sub(lambda m: _say_digits(m.group(0)), text)
class SpokenKokoroTTSService(KokoroTTSService):
    """KokoroTTSService that passes text through ``tts_normalize`` before synthesis.

    Phone numbers, street numbers, and zips are thereby read digit by digit instead
    of as cardinals with a spoken "dash".
    """

    async def run_tts(self, text: str, *args, **kwargs):
        """Normalize *text*, then yield every frame produced by the parent ``run_tts``.

        Extra positional and keyword arguments are forwarded untouched so the
        override keeps working if the base method takes more than the text.
        NOTE(review): the base signature is not visible in this file; some pipecat
        releases appear to pass a context id alongside the text. Confirm against
        the installed version.
        """
        spoken = tts_normalize(text)
        async for frame in super().run_tts(spoken, *args, **kwargs):
            yield frame
def build_llm_service():
"""Build the LLM service for the selected provider. The universal LLMContext +
aggregators work with either, so only this construction differs (true A/B swap)."""
@@ -351,7 +394,7 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru
llm.register_function("record_appointment_request", _record_appointment)
tts = KokoroTTSService(
tts = SpokenKokoroTTSService(
model_path=os.path.join(MODEL_DIR, "kokoro-v1.0.onnx"),
voices_path=os.path.join(MODEL_DIR, "voices-v1.0.bin"),
settings=KokoroTTSService.Settings(voice=KOKORO_VOICE),