feat: chatterbox TTS via madcat-tts daemon, Web Speech API STT, styled persona picker

- tts.py: replace the piper subprocess with an HTTP POST to madcat-tts /v1/audio/speech (chatterbox voice cloning)
- chat.js: replace the whisper server upload with the browser Web Speech API (webkitSpeechRecognition)
- chat.css: style the persona picker — appearance:none select, themed with CSS vars, mobile responsive
- main.py: default TTS voice → bt7274-en
2026-05-29 16:43:41 +02:00
parent f3c35eba72
commit ae384fe618
4 changed files with 165 additions and 131 deletions
@@ -403,7 +403,7 @@ else:
 # --- TTS / STT ---
 TTS_ENABLED = os.environ.get("TTS_ENABLED", "true").lower() != "false"
-TTS_VOICE = os.environ.get("TTS_VOICE", "en_US-amy-medium")
+TTS_VOICE = os.environ.get("TTS_VOICE", "bt7274-en")
 tts = TTS(voice=TTS_VOICE) if TTS_ENABLED else None
 STT_ENABLED = os.environ.get("STT_ENABLED", "true").lower() != "false"
@@ -242,6 +242,76 @@ body {
 .sigil:hover { opacity: 0.85; }
 .sigil img { width: 100%; height: 100%; display: block; }
 /* ---------- persona picker (inline in topnav) ---------- */
 /* Row container: label, select and status sit on one line, vertically centred, with a small gap. */
 .topnav__persona-wrap {
  display: flex;
  align-items: center;
  gap: 0.45rem;
 }
 /* Tiny uppercase caption next to the select; dimmed and not selectable. */
 .topnav__persona-label {
  font-family: var(--sans);
  font-size: 0.58rem;
  font-weight: 400;
  letter-spacing: 0.16em;
  text-transform: uppercase;
  color: var(--ink-faint);
  opacity: 0.45;
  cursor: default;
  user-select: none;
 }
 /* The <select> itself. Native chrome is switched off (appearance: none), so the
    border, background and dropdown arrow are all drawn by this rule. */
 .topnav__persona-select {
  appearance: none;
  -webkit-appearance: none;
  border: 1px solid var(--ink-faint);
  border-radius: 3px;
  background: var(--bg-soft);
  color: var(--ink-muted);
  font-family: var(--sans);
  font-size: 0.62rem;
  font-weight: 400;
  letter-spacing: 0.08em;
  text-transform: uppercase;
  /* wider right padding (1.3rem) leaves room for the arrow placed 0.4rem from the right edge */
  padding: 0.2rem 1.3rem 0.2rem 0.45rem;
  cursor: pointer;
  /* NOTE(review): the default focus ring is removed here; the only focus cue is the
     border-color change in the :focus-visible rule below — confirm contrast is sufficient. */
  outline: none;
  /* dimmed at rest; hover / focus-visible raise opacity to 1 (animated by the transition) */
  opacity: 0.55;
  transition: opacity 400ms ease, color 400ms ease, border-color 400ms ease;
  /* custom dropdown arrow */
  /* The SVG stroke colour is hard-coded (%236a6a6f is the URL-encoded #6a6a6f), so the
     arrow does not follow the CSS variables used for the rest of the control. */
  background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='8' height='5' fill='none'%3E%3Cpath d='M1 1l3 3 3-3' stroke='%236a6a6f' stroke-width='1.2' stroke-linecap='round' stroke-linejoin='round'/%3E%3C/svg%3E");
  background-repeat: no-repeat;
  background-position: right 0.4rem center;
 }
 /* Hover: full opacity, stronger text and border colours. */
 .topnav__persona-select:hover {
  opacity: 1;
  color: var(--ink);
  border-color: var(--ink-muted);
 }
 /* Keyboard focus: same as hover but with the coral accent on the border. */
 .topnav__persona-select:focus-visible {
  opacity: 1;
  color: var(--ink);
  border-color: var(--coral);
 }
 /* Short status text shown beside the select, in the coral accent colour. */
 .topnav__persona-status {
  font-family: var(--sans);
  font-size: 0.58rem;
  font-weight: 400;
  letter-spacing: 0.12em;
  color: var(--coral);
  opacity: 0.7;
  transition: opacity 400ms ease;
  user-select: none;
 }
 /* With no content the status element is removed from layout entirely. */
 .topnav__persona-status:empty { display: none; }
 /* Viewports up to 600px wide: drop the caption and tighten the select's text and padding. */
@media (max-width: 600px) {
  .topnav__persona-label { display: none; }
  .topnav__persona-select { font-size: 0.58rem; padding: 0.15rem 1.1rem 0.15rem 0.35rem; }
 }
 /* ===================== CONVERSATION ===================== */
 .conversation {
  flex: 1 0 auto;
@@ -241,77 +241,87 @@ document.addEventListener('keydown', (e) => {
  }
 });
-// ---------- voice input (whisper) ----------
+// ---------- voice input (Web Speech API) ----------
 const $mic = document.getElementById('mic-button');
-let mediaRecorder = null;
+const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
-let recordedChunks = [];
+let recognition = null;
 let recording = false;
-async function startRecording() {
+// Disable mic if browser lacks support
 if (!SpeechRecognition) {
  console.warn('Web Speech API not supported — mic disabled');
  $mic.disabled = true;
  $mic.title = 'Speech recognition not supported in this browser';
 }
 /**
  * Begin a speech-recognition session and mirror its transcript into `$input`.
  *
  * Does nothing when `SpeechRecognition` is falsy. Any instance still held in
  * the shared `recognition` variable is aborted first, then a new one is
  * created with `continuous = false`, `interimResults = true` and
  * `lang = 'en-US'`, and its 'result', 'end' and 'error' listeners are wired
  * before `start()` is attempted.
  */
 function startRecording() {
  if (!SpeechRecognition) return;
  if (recognition) {
    // stale instance — tear it down first
    try { recognition.abort(); } catch {}
    recognition = null;
  }
  recognition = new SpeechRecognition();
  recognition.continuous = false;
  recognition.interimResults = true;
  recognition.lang = 'en-US';
  recognition.addEventListener('result', (e) => {
    // Build transcript from all results (interim + final)
    let transcript = '';
    for (let i = 0; i < e.results.length; i++) {
      transcript += e.results[i][0].transcript;
    }
    // Overwrites whatever is currently in the input on every result event.
    $input.value = transcript;
  });
  // NOTE(review): this listener resets the shared `recording` / `recognition`
  // variables without checking which instance fired it. If the 'end' of an
  // instance aborted above is delivered after the new instance was assigned,
  // it would clear the new reference — consider comparing against a captured
  // instance before touching shared state.
  recognition.addEventListener('end', () => {
    recording = false;
    $mic.classList.remove('recording');
    recognition = null;
    // Auto-send if we got text
    // The check reads the current input value, so text that was already in
    // the field is submitted too, not only freshly recognised speech.
    const text = $input.value.trim();
    if (text) {
      $form.dispatchEvent(new Event('submit', { cancelable: true }));
    } else {
      flashMicEmpty();
    }
  });
  // NOTE(review): the Web Speech API is expected to fire 'end' after 'error'
  // as well; if so, flashMicEmpty() runs here and again in the 'end' listener,
  // and the 'end' listener may still auto-submit — verify in target browsers.
  recognition.addEventListener('error', (e) => {
    recording = false;
    $mic.classList.remove('recording');
    recognition = null;
    // 'no-speech' and 'aborted' are expected — not worth alarming the user
    if (e.error === 'no-speech') {
      flashMicEmpty();
    } else if (e.error !== 'aborted') {
      console.warn('speech recognition error:', e.error);
      flashMicEmpty();
    }
  });
  try {
-    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+    recognition.start();
    // NOTE(review): the unmarked lines from here to `mediaRecorder.start()`
    // use `stream`, which is only declared on the '-' line above — they look
    // like removed MediaRecorder code whose '-' marker was lost; confirm
    // against the real file before relying on this hunk.
    recordedChunks = [];
    // Prefer webm/opus if browser supports it (Chrome/FF). Safari may need fallback.
    const mimeTypes = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4', ''];
    const mime = mimeTypes.find(m => !m || MediaRecorder.isTypeSupported(m)) || '';
    mediaRecorder = new MediaRecorder(stream, mime ? { mimeType: mime } : undefined);
    mediaRecorder.addEventListener('dataavailable', (e) => {
      if (e.data && e.data.size > 0) recordedChunks.push(e.data);
    });
    mediaRecorder.addEventListener('stop', () => {
      // release the mic immediately
      stream.getTracks().forEach(t => t.stop());
      handleRecorded();
    });
    mediaRecorder.start();
    recording = true;
    $mic.classList.add('recording');
  } catch (err) {
-    console.warn('microphone unavailable:', err.message);
+    console.warn('speech recognition failed to start:', err.message);
    // Start failed: only the flag is reset here; `recognition` keeps the
    // instance, which the next startRecording() call aborts as stale.
    recording = false;
  }
 }
 /**
  * Ask the active recogniser to stop and switch the mic button from the
  * 'recording' look to the 'transcribing' look. No-op when nothing is active.
  */
 function stopRecording() {
-  if (!mediaRecorder || mediaRecorder.state === 'inactive') return;
+  if (!recognition) return;
-  mediaRecorder.stop();
+  try { recognition.stop(); } catch {}
-  recording = false;
+  // 'end' event handler cleans up recording state and sends
  $mic.classList.remove('recording');
  // NOTE(review): in the code visible here 'transcribing' is only ever removed
  // by handleRecorded(); the recognition 'end' / 'error' listeners remove
  // 'recording' only. Confirm the class is cleared on the Web Speech path,
  // otherwise the button stays in the transcribing state.
  $mic.classList.add('transcribing');
 }
 /**
  * Upload the buffered audio chunks to `/api/transcribe` and act on the reply.
  *
  * With no chunks buffered, only the 'transcribing' class is cleared. Otherwise
  * the chunks are wrapped in one Blob (typed after the first chunk, defaulting
  * to 'audio/webm') and posted as the multipart field `audio`, named
  * `speech.<ext>`. Non-blank text is placed in `$input` and the form is
  * submitted; blank text or any failure triggers `flashMicEmpty()`. The
  * 'transcribing' class is always removed at the end.
  *
  * @returns {Promise<void>}
  */
 async function handleRecorded() {
  const clearTranscribing = () => {
    $mic.classList.remove('transcribing');
  };

  if (!recordedChunks.length) {
    clearTranscribing();
    return;
  }

  const mimeType = recordedChunks[0].type || 'audio/webm';
  const audio = new Blob(recordedChunks, { type: mimeType });
  // "audio/webm;codecs=opus" -> "webm": take the subtype and drop any parameters
  const subtype = audio.type.split('/')[1] || 'webm';
  const extension = subtype.split(';')[0];

  const payload = new FormData();
  payload.append('audio', audio, `speech.${extension}`);

  try {
    const response = await fetch('/api/transcribe', { method: 'POST', body: payload });
    if (!response.ok) throw new Error(`HTTP ${response.status}`);
    const result = await response.json();
    const heard = result.text;
    if (!heard || !heard.trim()) {
      flashMicEmpty();
      return;
    }
    $input.value = heard;
    // auto-send — Her vibe is "speak and she hears"
    $form.dispatchEvent(new Event('submit', { cancelable: true }));
  } catch (err) {
    console.warn('transcription failed:', err.message);
    flashMicEmpty();
  } finally {
    clearTranscribing();
  }
 }
 /**
  * Briefly mark the mic button as "nothing heard": the 'empty' class is added
  * now and removed again 700 ms later. Deliberately silent — no toast or popup.
  */
 function flashMicEmpty() {
  const flashClass = 'empty';
  const clearFlash = () => {
    $mic.classList.remove(flashClass);
  };
  $mic.classList.add(flashClass);
  setTimeout(clearFlash, 700);
 }
@@ -1,101 +1,55 @@
-"""Piper TTS adapter for chat.saiden.dev.
+"""HTTP TTS adapter for chat.saiden.dev — madcat-tts daemon (chatterbox).
-Synthesises text → WAV bytes by subprocess'ing the `piper` CLI binary
+Calls the madcat-tts daemon's OpenAI-compatible /v1/audio/speech endpoint
-(already installed on every host that runs marauder-os).
+to synthesize text → WAV bytes via chatterbox voice cloning.
-Designed to fail silently — if piper is missing or synthesis errors,
+Designed to fail silently — if the daemon is down or synthesis errors,
 the chat still works, just without voice.
 """
 from __future__ import annotations
 import asyncio
 import logging
 import os
-import shutil
+
-import tempfile
+import httpx
 from pathlib import Path
 log = logging.getLogger("chat-saiden.tts")
-# Where the voice .onnx files live across hosts.
+MADCAT_TTS_URL = os.environ.get("MADCAT_TTS_URL", "http://localhost:14099")
 # Order: env override → macOS marauder → linux marauder → linux marauder-agent (mesh node) → linux ~/.local
 _VOICE_SEARCH_PATHS = [
    Path.home() / "Library/Application Support/marauder/voices",
    Path("/home") / os.environ.get("USER", "marauder") / ".local/share/marauder/voices",
    Path.home() / ".local/share/marauder/voices",
    Path.home() / ".local/share/psn/voices",
    Path.home() / ".local/share/piper/voices",
 ]
 def _resolve_voice_path(name: str) -> Path | None:
    """Locate the ``<name>.onnx`` voice model file, or return None.

    A non-empty ``TTS_VOICE_PATH`` environment variable takes precedence: that
    path is returned if it exists, otherwise the result is None and the search
    directories are not consulted. Without the override, the first existing
    ``<name>.onnx`` found under ``_VOICE_SEARCH_PATHS`` (in list order) wins.
    """
    explicit = os.environ.get("TTS_VOICE_PATH")
    if explicit:
        pinned = Path(explicit)
        if pinned.exists():
            return pinned
        return None

    filename = f"{name}.onnx"
    # lazy generator: stops probing the filesystem at the first hit
    located = (directory / filename for directory in _VOICE_SEARCH_PATHS)
    return next((hit for hit in located if hit.exists()), None)
 PIPER_BIN = shutil.which("piper") or os.environ.get("PIPER_BIN")
 # Text-to-speech adapter. This hunk interleaves the removed piper-subprocess
 # implementation ('-' lines) with the added madcat-tts HTTP implementation
 # ('+' lines); the notes below flag unmarked lines that appear to belong to
 # the removed side.
 class TTS:
-    """Subprocess-based piper synthesizer with graceful fallback."""
+    """HTTP-based madcat-tts synthesizer with graceful fallback."""
-    def __init__(self, voice: str = "en_US-amy-medium") -> None:
+    def __init__(self, voice: str = "bt7274-en") -> None:
        self.voice = voice
-        self.voice_path = _resolve_voice_path(voice)
+        self._url = f"{MADCAT_TTS_URL.rstrip('/')}/v1/audio/speech"
-        self.bin = PIPER_BIN
+        log.info("TTS enabled — voice=%s url=%s", voice, self._url)
        # NOTE(review): the unmarked branch below reads self.bin and
        # self.voice_path, which are assigned only on '-' lines above. It looks
        # like removed code whose '-' marker was lost — confirm; if it is still
        # present in the new file, __init__ raises AttributeError.
        if not self.bin:
            log.warning("piper binary not found on PATH — TTS disabled")
        elif not self.voice_path:
            log.warning("voice '%s' not found in known locations — TTS disabled", voice)
        else:
            log.info("TTS enabled — voice=%s path=%s bin=%s", voice, self.voice_path, self.bin)
    @property
    def available(self) -> bool:
        # On the '+' side this is unconditionally True: nothing here probes the
        # daemon, so an unreachable service only shows up as None from synthesize().
-        return bool(self.bin and self.voice_path)
+        return True
    async def synthesize(self, text: str) -> bytes | None:
        """Return WAV bytes, or None on failure / unavailable."""
-        if not self.available:
+        if not text or not text.strip():
            return None
        # NOTE(review): the '+' guard above already covers blank text; this
        # unmarked check repeats it.
        if not text.strip():
            return None
        # NOTE(review): the temp-file lines below use `tempfile`, whose import
        # sits on a '-' line in this diff, and the HTTP path never writes to
        # out_path — likely leftovers of the piper implementation; confirm.
        # piper wants an output file path (no stdout streaming for WAV in older versions)
        out = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        out.close()
        out_path = out.name
        try:
            # '+' side: one POST per call on a fresh AsyncClient with a 15 s
            # timeout; the response body is returned as-is.
-            proc = await asyncio.create_subprocess_exec(
+            async with httpx.AsyncClient(timeout=15.0) as client:
-                self.bin,
+                resp = await client.post(
-                "--model", str(self.voice_path),
+                    self._url,
-                "--output_file", out_path,
+                    json={
-                stdin=asyncio.subprocess.PIPE,
+                        "input": text,
-                stdout=asyncio.subprocess.DEVNULL,
+                        "voice": self.voice,
-                stderr=asyncio.subprocess.PIPE,
+                        "response_format": "wav",
-            )
+                    },
-            _, stderr = await proc.communicate(text.encode("utf-8"))
+                )
-            if proc.returncode != 0:
+                resp.raise_for_status()
-                log.error("piper exited %s: %s", proc.returncode, stderr.decode("utf-8", "replace")[:300])
+                return resp.content
-                return None
+        except httpx.TimeoutException:
            # Timeouts are logged at warning level with the first 60 characters
            # of the text; every other failure is logged with a traceback below.
-            with open(out_path, "rb") as f:
+            log.warning("TTS timeout for voice=%s (text=%s…)", self.voice, text[:60])
-                return f.read()
+            return None
-        except Exception:
+        except Exception:
-            log.exception("piper synthesis failed")
+            log.exception("TTS synthesis failed for voice=%s", self.voice)
            return None
        # NOTE(review): this cleanup depends on out_path from the unmarked
        # temp-file lines above; it is only meaningful if those lines remain.
        finally:
            try:
                os.unlink(out_path)
            except OSError:
                pass