fix: strip markdown for TTS + render rich markdown in chat UI
- _md_to_speech(): AST-based markdown stripping via markdown-it-py
- Truncate TTS input at 450 chars on a sentence boundary (chatterbox overflow)
- chat.js: render assistant messages as markdown via marked.js + DOMPurify
- Typewriter accumulates raw text and renders markdown progressively
This commit is contained in:
+55
-1
@@ -939,17 +939,71 @@ async def _send_audio(ws: WebSocket, text: str) -> None:
|
||||
await _send_audio_with_voice(ws, text, tts.voice)
|
||||
|
||||
|
||||
# Maximum number of characters of speech text kept by _md_to_speech().
_TTS_MAX_CHARS = 450 # chatterbox s3-tokenizer overflows beyond ~500 chars
|
||||
|
||||
|
||||
def _md_to_speech(text: str) -> str:
    """Strip markdown to plain speech-ready text, capped for TTS safety.

    1. Parse the markdown token stream; skip fenced/indented code blocks
       and horizontal rules.
    2. Collect plain text from inline children (text and inline code);
       soft and hard line breaks become a single space.
    3. If the result is longer than _TTS_MAX_CHARS, truncate at the last
       sentence boundary inside the limit to avoid chatterbox token
       overflow (garbled audio on long inputs). Without a sentence
       boundary, fall back to the last word boundary, then to a hard cut.

    Args:
        text: Raw markdown text.

    Returns:
        Whitespace-normalized plain text of at most _TTS_MAX_CHARS characters.
    """
    import re

    from markdown_it import MarkdownIt

    skipped_blocks = ("fence", "hr", "code_block")
    block_closers = (
        "paragraph_close",
        "heading_close",
        "blockquote_close",
        "list_item_close",
    )

    parts: list[str] = []
    for token in MarkdownIt().parse(text):
        if token.type in skipped_blocks:
            continue
        if token.type in block_closers:
            # Keep adjacent blocks from running together once joined.
            parts.append(" ")
            continue
        if token.children:
            for child in token.children:
                if child.type in ("text", "code_inline"):
                    parts.append(child.content)
                elif child.type in ("softbreak", "hardbreak"):
                    # Any line break inside a paragraph is a word separator;
                    # dropping hard breaks would glue neighbouring words.
                    parts.append(" ")
        elif token.type == "inline" and token.content:
            parts.append(token.content)

    # Collapse every whitespace run to one space (also trims both ends).
    clean = " ".join("".join(parts).split())

    if len(clean) <= _TTS_MAX_CHARS:
        return clean

    # Sentence boundary: ., ! or ? followed by whitespace. Scan forward and
    # keep the last hit. The window extends one character past the limit so
    # punctuation sitting exactly at the limit can be confirmed by the
    # whitespace after it; the cut itself never exceeds _TTS_MAX_CHARS.
    window = clean[: _TTS_MAX_CHARS + 1]
    sentence_ends = [m.end() for m in re.finditer(r"[.!?](?=\s)", window)]
    if sentence_ends:
        return clean[: sentence_ends[-1]]

    # No sentence boundary: cut at the last space, unless that would
    # discard more than half of the allowed text.
    truncated = clean[:_TTS_MAX_CHARS]
    last_space = truncated.rfind(" ")
    if last_space > 200:
        return truncated[:last_space].strip()
    return truncated.strip()
|
||||
|
||||
|
||||
async def _send_audio_with_voice(ws: WebSocket, text: str, voice_id: str) -> None:
|
||||
"""Synthesize text in a specific voice and ship as audio. Used post-calibration."""
|
||||
if not TTS_ENABLED:
|
||||
return
|
||||
import base64
|
||||
try:
|
||||
speech_text = _md_to_speech(text) if text else ""
|
||||
if not speech_text:
|
||||
return
|
||||
# spin up a per-voice synthesizer (cheap — just object init)
|
||||
per_voice = TTS(voice=voice_id) if voice_id != (tts.voice if tts else "") else tts
|
||||
if not per_voice or not per_voice.available:
|
||||
return
|
||||
wav = await per_voice.synthesize(text)
|
||||
wav = await per_voice.synthesize(speech_text)
|
||||
if not wav:
|
||||
return
|
||||
await ws.send_json({
|
||||
|
||||
Reference in New Issue
Block a user