feat: chatterbox TTS via madcat-tts daemon, Web Speech API STT, styled persona picker

- tts.py: replace piper subprocess with HTTP POST to madcat-tts /v1/audio/speech (chatterbox voice cloning)
- chat.js: replace whisper server upload with browser Web Speech API (SpeechRecognition, falling back to webkitSpeechRecognition)
- chat.css: style persona picker — appearance:none select, themed with CSS vars, mobile responsive
- main.py: default TTS voice → bt7274-en
2026-05-29 16:43:41 +02:00
parent f3c35eba72
commit ae384fe618
4 changed files with 165 additions and 131 deletions
@@ -242,6 +242,76 @@ body {
 .sigil:hover { opacity: 0.85; }
 .sigil img { width: 100%; height: 100%; display: block; }

+/* ---------- persona picker (inline in topnav) ---------- */
+.topnav__persona-wrap {
+  display: flex;
+  align-items: center;
+  gap: 0.45rem;
+}
+
+.topnav__persona-label {
+  font-family: var(--sans);
+  font-size: 0.58rem;
+  font-weight: 400;
+  letter-spacing: 0.16em;
+  text-transform: uppercase;
+  color: var(--ink-faint);
+  opacity: 0.45;
+  cursor: default;
+  user-select: none;
+}
+
+.topnav__persona-select {
+  appearance: none;
+  -webkit-appearance: none;
+  border: 1px solid var(--ink-faint);
+  border-radius: 3px;
+  background: var(--bg-soft);
+  color: var(--ink-muted);
+  font-family: var(--sans);
+  font-size: 0.62rem;
+  font-weight: 400;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+  padding: 0.2rem 1.3rem 0.2rem 0.45rem;
+  cursor: pointer;
+  outline: none;
+  opacity: 0.55;
+  transition: opacity 400ms ease, color 400ms ease, border-color 400ms ease;
+
+  /* custom dropdown arrow */
+  background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='8' height='5' fill='none'%3E%3Cpath d='M1 1l3 3 3-3' stroke='%236a6a6f' stroke-width='1.2' stroke-linecap='round' stroke-linejoin='round'/%3E%3C/svg%3E");
+  background-repeat: no-repeat;
+  background-position: right 0.4rem center;
+}
+.topnav__persona-select:hover {
+  opacity: 1;
+  color: var(--ink);
+  border-color: var(--ink-muted);
+}
+.topnav__persona-select:focus-visible {
+  opacity: 1;
+  color: var(--ink);
+  border-color: var(--coral);
+}
+
+.topnav__persona-status {
+  font-family: var(--sans);
+  font-size: 0.58rem;
+  font-weight: 400;
+  letter-spacing: 0.12em;
+  color: var(--coral);
+  opacity: 0.7;
+  transition: opacity 400ms ease;
+  user-select: none;
+}
+.topnav__persona-status:empty { display: none; }
+
+@media (max-width: 600px) {
+  .topnav__persona-label { display: none; }
+  .topnav__persona-select { font-size: 0.58rem; padding: 0.15rem 1.1rem 0.15rem 0.35rem; }
+}
+
 /* ===================== CONVERSATION ===================== */
 .conversation {
  flex: 1 0 auto;
@@ -241,77 +241,87 @@ document.addEventListener('keydown', (e) => {
  }
 });

-// ---------- voice input (whisper) ----------
+// ---------- voice input (Web Speech API) ----------

 const $mic = document.getElementById('mic-button');
-let mediaRecorder = null;
-let recordedChunks = [];
+const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
+let recognition = null;
 let recording = false;

-async function startRecording() {
+// Disable mic if browser lacks support
+if (!SpeechRecognition) {
+  console.warn('Web Speech API not supported — mic disabled');
+  $mic.disabled = true;
+  $mic.title = 'Speech recognition not supported in this browser';
+}
+
+function startRecording() {
+  if (!SpeechRecognition) return;
+  if (recognition) {
+    // stale instance — tear it down first
+    try { recognition.abort(); } catch {}
+    recognition = null;
+  }
+
+  recognition = new SpeechRecognition();
+  recognition.continuous = false;
+  recognition.interimResults = true;
+  recognition.lang = 'en-US';
+
+  recognition.addEventListener('result', (e) => {
+    // Build transcript from all results (interim + final)
+    let transcript = '';
+    for (let i = 0; i < e.results.length; i++) {
+      transcript += e.results[i][0].transcript;
+    }
+    $input.value = transcript;
+  });
+
+  recognition.addEventListener('end', () => {
+    recording = false;
+    $mic.classList.remove('recording');
+    recognition = null;
+
+    // Auto-send if we got text
+    const text = $input.value.trim();
+    if (text) {
+      $form.dispatchEvent(new Event('submit', { cancelable: true }));
+    } else {
+      flashMicEmpty();
+    }
+  });
+
+  recognition.addEventListener('error', (e) => {
+    recording = false;
+    $mic.classList.remove('recording');
+    recognition = null;
+
+    // 'no-speech' and 'aborted' are expected — not worth alarming the user
+    if (e.error === 'no-speech') {
+      flashMicEmpty();
+    } else if (e.error !== 'aborted') {
+      console.warn('speech recognition error:', e.error);
+      flashMicEmpty();
+    }
+  });
+
  try {
-    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-    recordedChunks = [];
-    // Prefer webm/opus if browser supports it (Chrome/FF). Safari may need fallback.
-    const mimeTypes = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4', ''];
-    const mime = mimeTypes.find(m => !m || MediaRecorder.isTypeSupported(m)) || '';
-    mediaRecorder = new MediaRecorder(stream, mime ? { mimeType: mime } : undefined);
-    mediaRecorder.addEventListener('dataavailable', (e) => {
-      if (e.data && e.data.size > 0) recordedChunks.push(e.data);
-    });
-    mediaRecorder.addEventListener('stop', () => {
-      // release the mic immediately
-      stream.getTracks().forEach(t => t.stop());
-      handleRecorded();
-    });
-    mediaRecorder.start();
+    recognition.start();
    recording = true;
    $mic.classList.add('recording');
  } catch (err) {
-    console.warn('microphone unavailable:', err.message);
+    console.warn('speech recognition failed to start:', err.message);
    recording = false;
  }
 }

 function stopRecording() {
-  if (!mediaRecorder || mediaRecorder.state === 'inactive') return;
-  mediaRecorder.stop();
-  recording = false;
-  $mic.classList.remove('recording');
-  $mic.classList.add('transcribing');
-}
-
-async function handleRecorded() {
-  if (!recordedChunks.length) {
-    $mic.classList.remove('transcribing');
-    return;
-  }
-  const blob = new Blob(recordedChunks, { type: recordedChunks[0].type || 'audio/webm' });
-  const ext = (blob.type.split('/')[1] || 'webm').split(';')[0];
-  const form = new FormData();
-  form.append('audio', blob, `speech.${ext}`);
-
-  try {
-    const resp = await fetch('/api/transcribe', { method: 'POST', body: form });
-    if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
-    const { text } = await resp.json();
-    if (text && text.trim()) {
-      $input.value = text;
-      // auto-send — Her vibe is "speak and she hears"
-      $form.dispatchEvent(new Event('submit', { cancelable: true }));
-    } else {
-      flashMicEmpty();
-    }
-  } catch (err) {
-    console.warn('transcription failed:', err.message);
-    flashMicEmpty();
-  } finally {
-    $mic.classList.remove('transcribing');
-  }
+  if (!recognition) return;
+  try { recognition.stop(); } catch {}
+  // 'end' event handler cleans up recording state and sends
 }

 function flashMicEmpty() {
-  // brief visual hint that nothing was heard — no toast, no popup, just a flash
  $mic.classList.add('empty');
  setTimeout(() => $mic.classList.remove('empty'), 700);
 }