fix: strip markdown for TTS + render rich markdown in chat UI

- _md_to_speech(): AST-based markdown stripping via markdown-it-py
- Truncate TTS input at 450 chars on sentence boundary (chatterbox overflow)
- chat.js: render assistant messages as markdown via marked.js + DOMPurify
- Typewriter accumulates raw text, renders markdown progressively
2026-05-29 19:05:04 +02:00
parent 34295d2f14
commit 13bb1c354b
3 changed files with 78 additions and 3 deletions
@@ -939,17 +939,71 @@ async def _send_audio(ws: WebSocket, text: str) -> None:
    await _send_audio_with_voice(ws, text, tts.voice)


+_TTS_MAX_CHARS = 450  # cap on text sent to TTS; chatterbox s3-tokenizer overflows beyond ~500 chars
+
+
+def _md_to_speech(text: str) -> str:
+    """Strip markdown to plain speech-ready text, capped for TTS safety.
+
+    Parses ``text`` with markdown-it-py and keeps only the spoken content:
+    fenced/indented code blocks and horizontal rules are dropped, inline
+    text and inline code are kept, and all whitespace is collapsed to
+    single spaces.  Output longer than _TTS_MAX_CHARS is cut at the last
+    sentence end (".", "!" or "?" followed by a space) inside the limit,
+    else at the last space beyond index 200, else hard-cut at the limit.
+
+    Returns the cleaned text (possibly empty), at most _TTS_MAX_CHARS long.
+    """
+    from markdown_it import MarkdownIt  # function-scope import, as before
+
+    # Block-level closers become a space so adjacent blocks do not fuse.
+    block_closers = ("paragraph_close", "heading_close", "blockquote_close", "list_item_close")
+    parts: list[str] = []
+    for token in MarkdownIt().parse(text):
+        if token.type in ("fence", "hr", "code_block"):
+            continue
+        if token.type in block_closers:
+            parts.append(" ")
+        elif token.children:
+            for child in token.children:
+                if child.type in ("text", "code_inline"):
+                    parts.append(child.content)
+                elif child.type in ("softbreak", "hardbreak"):
+                    # A line break of either kind still separates two words.
+                    parts.append(" ")
+        elif token.type == "inline" and token.content:
+            parts.append(token.content)
+    clean = " ".join("".join(parts).split())
+    if len(clean) <= _TTS_MAX_CHARS:
+        return clean
+
+    # Search the forward text, one char past the limit, so a terminator at
+    # the limit is confirmed by the space that follows it.
+    window = clean[: _TTS_MAX_CHARS + 1]
+    sentence_end = max(window.rfind(mark + " ") for mark in ".!?")
+    if sentence_end >= 0:
+        return clean[: sentence_end + 1]
+    # No sentence boundary: cut on a word only if that keeps over 200 chars.
+    last_space = window.rfind(" ")
+    if last_space > 200:
+        return clean[:last_space]
+    return clean[:_TTS_MAX_CHARS].strip()
+
+
 async def _send_audio_with_voice(ws: WebSocket, text: str, voice_id: str) -> None:
    """Synthesize text in a specific voice and ship as audio. Used post-calibration."""
    if not TTS_ENABLED:
        return
    import base64
    try:
+        speech_text = _md_to_speech(text) if text else ""
+        if not speech_text:
+            return
        # spin up a per-voice synthesizer (cheap — just object init)
        per_voice = TTS(voice=voice_id) if voice_id != (tts.voice if tts else "") else tts
        if not per_voice or not per_voice.available:
            return
-        wav = await per_voice.synthesize(text)
+        wav = await per_voice.synthesize(speech_text)
        if not wav:
            return
        await ws.send_json({