diff --git a/app/main.py b/app/main.py
index 50a0193..853debe 100644
--- a/app/main.py
+++ b/app/main.py
@@ -939,17 +939,71 @@ async def _send_audio(ws: WebSocket, text: str) -> None:
await _send_audio_with_voice(ws, text, tts.voice)
+_TTS_MAX_CHARS = 450 # length cap applied by _md_to_speech; chatterbox s3-tokenizer overflows beyond ~500 chars
+
+
+def _md_to_speech(text: str) -> str:
+ """Strip markdown to plain speech-ready text, capped for TTS safety.
+
+ 1. Parse markdown AST — skip code blocks, horizontal rules.
+ 2. Extract plain text from inline nodes.
+ 3. Truncate at sentence boundary near _TTS_MAX_CHARS to avoid
+ chatterbox token overflow (garbled audio on long inputs).
+ """
+ import re
+ from markdown_it import MarkdownIt
+
+ md = MarkdownIt()
+ tokens = md.parse(text)
+ parts: list[str] = []
+ for token in tokens:
+ if token.type in ("fence", "hr", "code_block"):
+ continue
+ if token.type in ("paragraph_close", "heading_close", "blockquote_close", "list_item_close"):
+ parts.append(" ")
+ continue
+ if token.children:
+ for child in token.children:
+ if child.type == "text":
+ parts.append(child.content)
+ elif child.type == "code_inline":
+ parts.append(child.content)
+ elif child.type == "softbreak":
+ parts.append(" ")
+ elif token.type == "inline" and token.content and not token.children:
+ parts.append(token.content)
+ clean = " ".join("".join(parts).split()).strip()
+
+ # Truncate at sentence boundary if too long
+ if len(clean) <= _TTS_MAX_CHARS:
+ return clean
+ # Find last sentence-ending punctuation before the limit
+ truncated = clean[:_TTS_MAX_CHARS]
+ m = re.search(r"[.!?](?:\s|$)", truncated[::-1])
+ if m:
+ cut = _TTS_MAX_CHARS - m.start()
+ return clean[:cut].strip()
+ # No sentence boundary — hard cut at last space
+ last_space = truncated.rfind(" ")
+ if last_space > 200:
+ return truncated[:last_space].strip()
+ return truncated.strip()
+
+
async def _send_audio_with_voice(ws: WebSocket, text: str, voice_id: str) -> None:
"""Synthesize text in a specific voice and ship as audio. Used post-calibration."""
if not TTS_ENABLED:
return
import base64
try:
+ speech_text = _md_to_speech(text) if text else ""
+ if not speech_text:
+ return
# spin up a per-voice synthesizer (cheap — just object init)
per_voice = TTS(voice=voice_id) if voice_id != (tts.voice if tts else "") else tts
if not per_voice or not per_voice.available:
return
- wav = await per_voice.synthesize(text)
+ wav = await per_voice.synthesize(speech_text)
if not wav:
return
await ws.send_json({
diff --git a/app/static/chat.js b/app/static/chat.js
index 7b25244..f6ac683 100644
--- a/app/static/chat.js
+++ b/app/static/chat.js
@@ -21,9 +21,22 @@ let ws = null;
let connectAttempts = 0;
let lastSpeaker = null; // 'user' | 'bt' | 'system' | null
let currentBtBody = null; // active streaming .msg__body element
+let currentBtRaw = ''; // raw markdown accumulator for streaming
let queue = [];
let draining = false;
// Configure marked's parse options (line breaks + GFM); sanitizing the output is done separately in renderMarkdown via DOMPurify
+if (typeof marked !== 'undefined') { // marked is optional: skip setup when the global is absent
+ marked.setOptions({ breaks: true, gfm: true });
+}
+
+function renderMarkdown(raw) {
+ if (typeof marked === 'undefined') return raw;
+ const html = marked.parse(raw);
+ if (typeof DOMPurify !== 'undefined') return DOMPurify.sanitize(html);
+ return html;
+}
+
// ---------- helpers ----------
function speakerLabel(role) {
@@ -97,7 +110,9 @@ async function drain() {
if (!currentBtBody) return;
draining = true;
while (queue.length) {
- currentBtBody.textContent += queue.shift();
+ currentBtRaw += queue.shift();
+ // Re-render markdown on each char (marked.parse is fast enough)
+ currentBtBody.innerHTML = renderMarkdown(currentBtRaw);
if (Math.random() < 0.04) scrollToBottom();
await sleep(TYPEWRITER_MS);
}
@@ -107,16 +122,19 @@ async function drain() {
function sleep(ms) { return new Promise(r => setTimeout(r, ms)); } // promise that resolves after a setTimeout of `ms`
function finishBt() { // finalize the streamed 'bt' message once the typewriter queue is empty
- // wait for the queue to drain before adding caret
+ // wait for the queue to drain, then do the final render and append the caret
const tick = () => {
if (draining || queue.length) { setTimeout(tick, 30); return; } // still typing: check again in 30 ms
if (!currentBtBody) return; // no active streaming body, nothing to finalize
+ // Final markdown render from the complete accumulated raw text (replaces the body's HTML)
+ currentBtBody.innerHTML = renderMarkdown(currentBtRaw);
const caret = document.createElement('span');
caret.className = 'caret';
currentBtBody.appendChild(caret);
scrollToBottom();
setTimeout(() => caret.remove(), 900); // caret is removed again after 900 ms
currentBtBody = null; // message finished; handleMessage creates a new body when currentBtBody is null
+ currentBtRaw = ''; // clear the raw markdown accumulator
};
tick();
}
@@ -188,6 +206,7 @@ function handleMessage(msg) {
if (!currentBtBody) {
removeThinking();
currentBtBody = makeMsg('bt');
+ currentBtRaw = '';
}
if (msg.delta) enqueue(msg.delta);
if (msg.done) finishBt();
diff --git a/app/templates/chat.html b/app/templates/chat.html
index 8dbb0f8..1ac49b7 100644
--- a/app/templates/chat.html
+++ b/app/templates/chat.html
@@ -14,6 +14,8 @@
+
+