Read phone numbers, street numbers, and zips digit-by-digit in TTS
Kokoro spoke "983-4969" as "nine hundred eighty-three dash forty-nine sixty-nine". Added SpokenKokoroTTSService, which normalizes text just before synthesis (run_tts gets the full sentence): US phone patterns and 4-5 digit runs (street numbers, zips) are spoken one digit at a time, with the country code dropped and no "dash" or parentheses read aloud. Dates and times are left natural. The normalization is deterministic, so it is robust to whatever the model emits. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
45
bot.py
45
bot.py
@@ -282,6 +282,49 @@ class HintedWhisperSTTService(WhisperSTTService):
|
||||
yield frame
|
||||
|
||||
|
||||
# ── TTS number normalization ──────────────────────────────────────────────────
|
||||
# Kokoro reads digit strings as cardinals with symbols spoken aloud, e.g. "983-4969"
|
||||
# becomes "nine hundred eighty-three dash forty-nine sixty-nine". For a phone agent that
|
||||
# reads back phone numbers, street numbers, and zips, that's unusable. We normalize the
|
||||
# text right before synthesis (run_tts receives the full sentence) so phone numbers and
|
||||
# long digit runs are spoken one digit at a time, regardless of what the model emitted.
|
||||
# Spoken form of each ASCII digit.
_DIGIT_WORDS = {
    "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four",
    "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine",
}

# US phone numbers, in one of two shapes:
#   * 10 digits with an optional +1 / 1 country code, optional parentheses around the
#     area code, and optional space / dot / hyphen separators;
#   * 7-digit local numbers, only when written with an explicit "-" or "." separator
#     (e.g. "983-4969"), so a bare 7-digit quantity is not mistaken for a phone number.
# The digit lookarounds stop the pattern from matching a slice of a longer digit run
# (order ids, card numbers), which would leave stray digits glued to the spoken words.
# [0-9] is used instead of \d because \d also matches non-ASCII Unicode digits, which
# _DIGIT_WORDS cannot speak.
_PHONE_RE = re.compile(
    r"(?<![0-9])"
    r"(?:"
    r"(?:\+?1[\s.\-]?)?\(?[0-9]{3}\)?[\s.\-]?[0-9]{3}[\s.\-]?[0-9]{4}"
    r"|[0-9]{3}[.\-][0-9]{4}"
    r")"
    r"(?![0-9])"
)
_LONGNUM_RE = re.compile(r"\b[0-9]{4,5}\b")  # street numbers, zip codes


def _say_digits(s: str) -> str:
    """Return the digits of *s* as space-separated words, ignoring every other character."""
    return " ".join(_DIGIT_WORDS[c] for c in s if c in _DIGIT_WORDS)


def _phone_to_words(m: re.Match) -> str:
    """Convert a ``_PHONE_RE`` match into comma-grouped, digit-by-digit words."""
    d = re.sub(r"[^0-9]", "", m.group(0))
    if len(d) == 11 and d.startswith("1"):  # drop US country code
        d = d[1:]
    if len(d) == 10:  # group as area / prefix / line for natural cadence
        return f"{_say_digits(d[:3])}, {_say_digits(d[3:6])}, {_say_digits(d[6:])}"
    if len(d) == 7:  # local number: prefix / line
        return f"{_say_digits(d[:3])}, {_say_digits(d[3:])}"
    return _say_digits(d)


def tts_normalize(text: str) -> str:
    """Rewrite phone numbers, street numbers, and zips so they are spoken digit by digit.

    Phone numbers are replaced first (grouped with commas, country code dropped,
    separators and parentheses removed); any remaining standalone 4-5 digit run is
    then spelled out one digit at a time. All other text is returned unchanged.
    """
    if not text:
        return text
    text = _PHONE_RE.sub(_phone_to_words, text)
    return _LONGNUM_RE.sub(lambda m: _say_digits(m.group(0)), text)
|
||||
|
||||
|
||||
class SpokenKokoroTTSService(KokoroTTSService):
    """KokoroTTSService that passes text through ``tts_normalize`` before synthesis,
    so phone numbers/addresses/zips are read digit by digit instead of as cardinals + 'dash'."""

    async def run_tts(self, text: str, *args, **kwargs):
        """Normalize *text*, then yield every frame produced by the parent ``run_tts``.

        Extra positional/keyword arguments are forwarded to the parent untouched.
        NOTE(review): this keeps the override working if the base ``run_tts`` is
        called with more than ``text`` — confirm against the installed base class.
        """
        spoken = tts_normalize(text)
        async for frame in super().run_tts(spoken, *args, **kwargs):
            yield frame
|
||||
|
||||
|
||||
def build_llm_service():
|
||||
"""Build the LLM service for the selected provider. The universal LLMContext +
|
||||
aggregators work with either, so only this construction differs (true A/B swap)."""
|
||||
@@ -351,7 +394,7 @@ async def run_agent(transport, caller_number=None, call_sid=None, do_capture=Tru
|
||||
|
||||
llm.register_function("record_appointment_request", _record_appointment)
|
||||
|
||||
tts = KokoroTTSService(
|
||||
tts = SpokenKokoroTTSService(
|
||||
model_path=os.path.join(MODEL_DIR, "kokoro-v1.0.onnx"),
|
||||
voices_path=os.path.join(MODEL_DIR, "voices-v1.0.bin"),
|
||||
settings=KokoroTTSService.Settings(voice=KOKORO_VOICE),
|
||||
|
||||
Reference in New Issue
Block a user