From 13bb1c354bc33e46d725353f69b3f3d64226d65f Mon Sep 17 00:00:00 2001
From: marauder-actual
Date: Fri, 29 May 2026 19:05:04 +0200
Subject: [PATCH] fix: strip markdown for TTS + render rich markdown in chat UI

- _md_to_speech(): AST-based markdown stripping via markdown-it-py
- Truncate TTS input at 450 chars on sentence boundary (chatterbox overflow)
- chat.js: render assistant messages as markdown via marked.js + DOMPurify
- Typewriter accumulates raw text, renders markdown progressively
---
Review notes for this revision (ignored by git am):
- _md_to_speech(): the sentence-boundary search ran the forward pattern over
  the reversed string, so it never matched ordinary prose and long replies
  were always cut mid-sentence. It now looks for the last ".", "!" or "?"
  that is followed by a space inside the limit. Hard breaks are treated
  like soft breaks so the words around them are no longer fused.
- renderMarkdown(): its result is assigned to innerHTML, so it now fails
  closed (HTML-escaped text) unless both marked and DOMPurify are loaded.
- This copy of the patch had its whitespace collapsed and angle-bracketed
  spans stripped. Indentation of context lines is reconstructed, the From:
  address is missing, the blob ids on the index lines predate this revision,
  and the app/templates/chat.html hunk has lost its context and both added
  lines (presumably the marked / DOMPurify script includes - confirm).
  Regenerate the patch from the working tree before applying.

 app/main.py             | 70 ++++++++++++++++++++++++++++++++++++++++-
 app/static/chat.js      | 23 ++++++++++++--
 app/templates/chat.html |  2 ++
 3 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/app/main.py b/app/main.py
index 50a0193..853debe 100644
--- a/app/main.py
+++ b/app/main.py
@@ -939,17 +939,85 @@ async def _send_audio(ws: WebSocket, text: str) -> None:
     await _send_audio_with_voice(ws, text, tts.voice)
 
 
+_TTS_MAX_CHARS = 450  # chatterbox s3-tokenizer overflows beyond ~500 chars
+
+
+def _md_to_speech(text: str) -> str:
+    """Strip markdown to plain speech-ready text, capped for TTS safety.
+
+    1. Parse markdown AST — skip code blocks, horizontal rules.
+    2. Extract plain text from inline nodes (text and inline code only;
+       all other inline nodes, e.g. emphasis markers, add nothing).
+    3. If the result is longer than _TTS_MAX_CHARS, cut it after the last
+       sentence end inside the limit to avoid chatterbox token overflow
+       (garbled audio on long inputs). Without a sentence end, cut at the
+       last space if it lies past 200 chars, else hard at the limit.
+
+    Args:
+        text: Markdown source to be spoken.
+
+    Returns:
+        Whitespace-normalised plain text of at most _TTS_MAX_CHARS
+        characters; empty when no inline text is left.
+    """
+    from markdown_it import MarkdownIt
+
+    parts: list[str] = []
+    for token in MarkdownIt().parse(text):
+        if token.type in ("fence", "hr", "code_block"):
+            continue
+        if token.type in ("paragraph_close", "heading_close", "blockquote_close", "list_item_close"):
+            # Block boundary: keep the last word of this block from fusing
+            # with the first word of the next one.
+            parts.append(" ")
+            continue
+        if token.children:
+            for child in token.children:
+                if child.type in ("text", "code_inline"):
+                    parts.append(child.content)
+                elif child.type in ("softbreak", "hardbreak"):
+                    # A line break of either kind separates words; skipping
+                    # hard breaks would fuse the words around them.
+                    parts.append(" ")
+        elif token.type == "inline" and token.content:
+            parts.append(token.content)
+    clean = " ".join("".join(parts).split())
+
+    if len(clean) <= _TTS_MAX_CHARS:
+        return clean
+
+    # After normalisation the only whitespace in `clean` is single spaces,
+    # so a sentence end is one of ".!?" followed by a space. Search one
+    # character past the limit so punctuation sitting exactly on the boundary
+    # only counts when a space really follows it (not the "." of "3.5").
+    window = clean[: _TTS_MAX_CHARS + 1]
+    sentence_end = max(window.rfind(mark + " ") for mark in ".!?")
+    if sentence_end != -1:
+        return clean[: sentence_end + 1]
+
+    # No sentence boundary — cut at the last space, unless it sits in the
+    # first 200 chars (over half the allowed length lost); then hard cut.
+    truncated = clean[:_TTS_MAX_CHARS]
+    last_space = truncated.rfind(" ")
+    if last_space > 200:
+        return truncated[:last_space].strip()
+    return truncated.strip()
+
+
 async def _send_audio_with_voice(ws: WebSocket, text: str, voice_id: str) -> None:
     """Synthesize text in a specific voice and ship as audio. Used post-calibration."""
     if not TTS_ENABLED:
         return
     import base64
     try:
+        speech_text = _md_to_speech(text) if text else ""
+        if not speech_text:
+            return
         # spin up a per-voice synthesizer (cheap — just object init)
         per_voice = TTS(voice=voice_id) if voice_id != (tts.voice if tts else "") else tts
         if not per_voice or not per_voice.available:
             return
-        wav = await per_voice.synthesize(text)
+        wav = await per_voice.synthesize(speech_text)
         if not wav:
             return
         await ws.send_json({
diff --git a/app/static/chat.js b/app/static/chat.js
index 7b25244..f6ac683 100644
--- a/app/static/chat.js
+++ b/app/static/chat.js
@@ -21,9 +21,22 @@ let ws = null;
 let connectAttempts = 0;
 let lastSpeaker = null; // 'user' | 'bt' | 'system' | null
 let currentBtBody = null; // active streaming .msg__body element
+let currentBtRaw = ''; // raw markdown accumulator for streaming
 let queue = [];
 let draining = false;
 
+// Configure marked (GFM + line breaks); its output is sanitized in renderMarkdown()
+if (typeof marked !== 'undefined') {
+  marked.setOptions({ breaks: true, gfm: true });
+}
+
+// Returns innerHTML-safe markup: sanitized markdown, or escaped text if a library is missing.
+function renderMarkdown(raw) {
+  const ready = typeof marked !== 'undefined' && typeof DOMPurify !== 'undefined';
+  if (!ready) return String(raw).replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);
+  return DOMPurify.sanitize(marked.parse(raw));
+}
+
 // ---------- helpers ----------
 
 function speakerLabel(role) {
@@ -97,7 +110,9 @@ async function drain() {
   if (!currentBtBody) return;
   draining = true;
   while (queue.length) {
-    currentBtBody.textContent += queue.shift();
+    currentBtRaw += queue.shift();
+    // Re-render markdown on each char (marked.parse is fast enough)
+    currentBtBody.innerHTML = renderMarkdown(currentBtRaw);
     if (Math.random() < 0.04) scrollToBottom();
     await sleep(TYPEWRITER_MS);
   }
@@ -107,16 +122,19 @@ async function drain() {
 function sleep(ms) { return new Promise(r => setTimeout(r, ms)); }
 
 function finishBt() {
-  // wait for the queue to drain before adding caret
+  // wait for the queue to drain before final render
   const tick = () => {
     if (draining || queue.length) { setTimeout(tick, 30); return; }
     if (!currentBtBody) return;
+    // Final markdown render with complete text
+    currentBtBody.innerHTML = renderMarkdown(currentBtRaw);
     const caret = document.createElement('span');
     caret.className = 'caret';
     currentBtBody.appendChild(caret);
     scrollToBottom();
     setTimeout(() => caret.remove(), 900);
     currentBtBody = null;
+    currentBtRaw = '';
   };
   tick();
 }
@@ -188,6 +206,7 @@ function handleMessage(msg) {
     if (!currentBtBody) {
       removeThinking();
       currentBtBody = makeMsg('bt');
+      currentBtRaw = '';
     }
     if (msg.delta) enqueue(msg.delta);
     if (msg.done) finishBt();
diff --git a/app/templates/chat.html b/app/templates/chat.html
index 8dbb0f8..1ac49b7 100644
--- a/app/templates/chat.html
+++ b/app/templates/chat.html
@@ -14,6 +14,8 @@
+
+