diff --git a/app/main.py b/app/main.py
index b12408a..d9abc2e 100644
--- a/app/main.py
+++ b/app/main.py
@@ -403,7 +403,7 @@ else:
 
 # --- TTS / STT ---
 TTS_ENABLED = os.environ.get("TTS_ENABLED", "true").lower() != "false"
-TTS_VOICE = os.environ.get("TTS_VOICE", "en_US-amy-medium")
+TTS_VOICE = os.environ.get("TTS_VOICE", "bt7274-en")
 tts = TTS(voice=TTS_VOICE) if TTS_ENABLED else None
 
 STT_ENABLED = os.environ.get("STT_ENABLED", "true").lower() != "false"
diff --git a/app/static/chat.css b/app/static/chat.css
index 7fafa60..8d93984 100644
--- a/app/static/chat.css
+++ b/app/static/chat.css
@@ -242,6 +242,76 @@ body {
 .sigil:hover { opacity: 0.85; }
 .sigil img { width: 100%; height: 100%; display: block; }
 
+/* ---------- persona picker (inline in topnav) ---------- */
+.topnav__persona-wrap {
+  display: flex;
+  align-items: center;
+  gap: 0.45rem;
+}
+
+.topnav__persona-label {
+  font-family: var(--sans);
+  font-size: 0.58rem;
+  font-weight: 400;
+  letter-spacing: 0.16em;
+  text-transform: uppercase;
+  color: var(--ink-faint);
+  opacity: 0.45;
+  cursor: default;
+  user-select: none;
+}
+
+.topnav__persona-select {
+  appearance: none;
+  -webkit-appearance: none;
+  border: 1px solid var(--ink-faint);
+  border-radius: 3px;
+  background: var(--bg-soft);
+  color: var(--ink-muted);
+  font-family: var(--sans);
+  font-size: 0.62rem;
+  font-weight: 400;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+  padding: 0.2rem 1.3rem 0.2rem 0.45rem;
+  cursor: pointer;
+  outline: none;
+  opacity: 0.55;
+  transition: opacity 400ms ease, color 400ms ease, border-color 400ms ease;
+
+  /* custom dropdown arrow */
+  background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='8' height='5' fill='none'%3E%3Cpath d='M1 1l3 3 3-3' stroke='%236a6a6f' stroke-width='1.2' stroke-linecap='round' stroke-linejoin='round'/%3E%3C/svg%3E");
+  background-repeat: no-repeat;
+  background-position: right 0.4rem center;
+}
+.topnav__persona-select:hover {
+  opacity: 1;
+  color: var(--ink);
+  border-color: var(--ink-muted);
+}
+.topnav__persona-select:focus-visible {
+  opacity: 1;
+  color: var(--ink);
+  border-color: var(--coral);
+}
+
+.topnav__persona-status {
+  font-family: var(--sans);
+  font-size: 0.58rem;
+  font-weight: 400;
+  letter-spacing: 0.12em;
+  color: var(--coral);
+  opacity: 0.7;
+  transition: opacity 400ms ease;
+  user-select: none;
+}
+.topnav__persona-status:empty { display: none; }
+
+@media (max-width: 600px) {
+  .topnav__persona-label { display: none; }
+  .topnav__persona-select { font-size: 0.58rem; padding: 0.15rem 1.1rem 0.15rem 0.35rem; }
+}
+
 /* ===================== CONVERSATION ===================== */
 .conversation {
   flex: 1 0 auto;
diff --git a/app/static/chat.js b/app/static/chat.js
index bb580a9..7b25244 100644
--- a/app/static/chat.js
+++ b/app/static/chat.js
@@ -241,77 +241,96 @@ document.addEventListener('keydown', (e) => {
   }
 });
 
-// ---------- voice input (whisper) ----------
+// ---------- voice input (Web Speech API) ----------
 const $mic = document.getElementById('mic-button');
-let mediaRecorder = null;
-let recordedChunks = [];
+const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
+let recognition = null;
 let recording = false;
 
 
-async function startRecording() {
+// Disable mic if browser lacks support
+if (!SpeechRecognition) {
+  console.warn('Web Speech API not supported — mic disabled');
+  $mic.disabled = true;
+  $mic.title = 'Speech recognition not supported in this browser';
+}
+
+function startRecording() {
+  if (!SpeechRecognition) return;
+  if (recognition) {
+    // stale instance — tear it down first
+    try { recognition.abort(); } catch {}
+    recognition = null;
+  }
+
+  // Handlers close over `rec` and ignore events once it is no longer the
+  // active session, so an aborted instance's late 'end'/'error' cannot
+  // reset state that now belongs to its replacement.
+  const rec = new SpeechRecognition();
+  let heard = false;  // a non-empty transcript arrived in this session
+  let failed = false; // 'error' fired; the 'end' that follows must not re-handle it
+  recognition = rec;
+  rec.continuous = false;
+  rec.interimResults = true;
+  rec.lang = 'en-US';
+
+  rec.addEventListener('result', (e) => {
+    if (recognition !== rec) return;
+    // Build transcript from all results (interim + final)
+    let transcript = '';
+    for (let i = 0; i < e.results.length; i++) {
+      transcript += e.results[i][0].transcript;
+    }
+    $input.value = transcript;
+    heard = transcript.trim() !== '';
+  });
+
+  rec.addEventListener('error', (e) => {
+    if (recognition !== rec) return;
+    failed = true;
+    // 'no-speech' and 'aborted' are expected — not worth alarming the user
+    if (e.error === 'aborted') return;
+    if (e.error !== 'no-speech') {
+      console.warn('speech recognition error:', e.error);
+    }
+    flashMicEmpty();
+  });
+
+  rec.addEventListener('end', () => {
+    if (recognition !== rec) return;
+    recording = false;
+    $mic.classList.remove('recording');
+    recognition = null;
+    // The 'error' handler already gave feedback; never auto-send after a failure
+    if (failed) return;
+
+    // Auto-send only speech heard in this session, not a leftover typed draft
+    if (heard) {
+      $form.dispatchEvent(new Event('submit', { cancelable: true }));
+    } else {
+      flashMicEmpty();
+    }
+  });
+
   try {
-    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-    recordedChunks = [];
-    // Prefer webm/opus if browser supports it (Chrome/FF). Safari may need fallback.
-    const mimeTypes = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4', ''];
-    const mime = mimeTypes.find(m => !m || MediaRecorder.isTypeSupported(m)) || '';
-    mediaRecorder = new MediaRecorder(stream, mime ? { mimeType: mime } : undefined);
-    mediaRecorder.addEventListener('dataavailable', (e) => {
-      if (e.data && e.data.size > 0) recordedChunks.push(e.data);
-    });
-    mediaRecorder.addEventListener('stop', () => {
-      // release the mic immediately
-      stream.getTracks().forEach(t => t.stop());
-      handleRecorded();
-    });
-    mediaRecorder.start();
+    rec.start();
     recording = true;
     $mic.classList.add('recording');
   } catch (err) {
-    console.warn('microphone unavailable:', err.message);
+    console.warn('speech recognition failed to start:', err.message);
     recording = false;
+    // start() threw, so no session began that could fire 'end' and clear the handle
+    recognition = null;
   }
 }
 
 function stopRecording() {
-  if (!mediaRecorder || mediaRecorder.state === 'inactive') return;
-  mediaRecorder.stop();
-  recording = false;
-  $mic.classList.remove('recording');
-  $mic.classList.add('transcribing');
-}
-
-async function handleRecorded() {
-  if (!recordedChunks.length) {
-    $mic.classList.remove('transcribing');
-    return;
-  }
-  const blob = new Blob(recordedChunks, { type: recordedChunks[0].type || 'audio/webm' });
-  const ext = (blob.type.split('/')[1] || 'webm').split(';')[0];
-  const form = new FormData();
-  form.append('audio', blob, `speech.${ext}`);
-
-  try {
-    const resp = await fetch('/api/transcribe', { method: 'POST', body: form });
-    if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
-    const { text } = await resp.json();
-    if (text && text.trim()) {
-      $input.value = text;
-      // auto-send — Her vibe is "speak and she hears"
-      $form.dispatchEvent(new Event('submit', { cancelable: true }));
-    } else {
-      flashMicEmpty();
-    }
-  } catch (err) {
-    console.warn('transcription failed:', err.message);
-    flashMicEmpty();
-  } finally {
-    $mic.classList.remove('transcribing');
-  }
+  if (!recognition) return;
+  try { recognition.stop(); } catch {}
+  // 'end' event handler cleans up recording state and sends
 }
 
 function flashMicEmpty() {
-  // brief visual hint that nothing was heard — no toast, no popup, just a flash
   $mic.classList.add('empty');
   setTimeout(() => $mic.classList.remove('empty'), 700);
 }
diff --git a/app/tts.py b/app/tts.py
index 9312154..29f6efb 100644
--- a/app/tts.py
+++ b/app/tts.py
@@ -1,101 +1,65 @@
-"""Piper TTS adapter for chat.saiden.dev.
+"""HTTP TTS adapter for chat.saiden.dev — madcat-tts daemon (chatterbox).
 
-Synthesises text → WAV bytes by subprocess'ing the `piper` CLI binary
-(already installed on every host that runs marauder-os).
+Calls the madcat-tts daemon's OpenAI-compatible /v1/audio/speech endpoint
+to synthesize text → WAV bytes via chatterbox voice cloning.
 
-Designed to fail silently — if piper is missing or synthesis errors,
+Designed to fail silently — if the daemon is down or synthesis errors,
 the chat still works, just without voice.
 """
 
 from __future__ import annotations
 
-import asyncio
 import logging
 import os
-import shutil
-import tempfile
-from pathlib import Path
+
+import httpx
 
 log = logging.getLogger("chat-saiden.tts")
 
-# Where the voice .onnx files live across hosts.
-# Order: env override → macOS marauder → linux marauder → linux marauder-agent (mesh node) → linux ~/.local
-_VOICE_SEARCH_PATHS = [
-    Path.home() / "Library/Application Support/marauder/voices",
-    Path("/home") / os.environ.get("USER", "marauder") / ".local/share/marauder/voices",
-    Path.home() / ".local/share/marauder/voices",
-    Path.home() / ".local/share/psn/voices",
-    Path.home() / ".local/share/piper/voices",
-]
-
-
-def _resolve_voice_path(name: str) -> Path | None:
-    """Return absolute path to a voice model by short name, or None."""
-    # explicit override
-    override = os.environ.get("TTS_VOICE_PATH")
-    if override:
-        p = Path(override)
-        return p if p.exists() else None
-
-    for base in _VOICE_SEARCH_PATHS:
-        candidate = base / f"{name}.onnx"
-        if candidate.exists():
-            return candidate
-    return None
-
-
-PIPER_BIN = shutil.which("piper") or os.environ.get("PIPER_BIN")
+MADCAT_TTS_URL = os.environ.get("MADCAT_TTS_URL", "http://localhost:14099")
 
 
 class TTS:
-    """Subprocess-based piper synthesizer with graceful fallback."""
+    """HTTP-based madcat-tts synthesizer with graceful fallback."""
 
-    def __init__(self, voice: str = "en_US-amy-medium") -> None:
+    def __init__(self, voice: str = "bt7274-en") -> None:
         self.voice = voice
-        self.voice_path = _resolve_voice_path(voice)
-        self.bin = PIPER_BIN
-        if not self.bin:
-            log.warning("piper binary not found on PATH — TTS disabled")
-        elif not self.voice_path:
-            log.warning("voice '%s' not found in known locations — TTS disabled", voice)
-        else:
-            log.info("TTS enabled — voice=%s path=%s bin=%s", voice, self.voice_path, self.bin)
+        self._url = f"{MADCAT_TTS_URL.rstrip('/')}/v1/audio/speech"
+        log.info("TTS enabled — voice=%s url=%s", voice, self._url)
 
     @property
     def available(self) -> bool:
-        return bool(self.bin and self.voice_path)
+        """Always True — the daemon is not probed here; synthesize() returns None on failure."""
+        return True
 
     async def synthesize(self, text: str) -> bytes | None:
-        """Return WAV bytes, or None on failure / unavailable."""
-        if not self.available:
+        """Return WAV bytes for *text*, or None on failure / unavailable.
+
+        Blank text, a timeout, a transport or HTTP-status error, and an empty
+        response body all yield None; errors are logged rather than raised.
+        """
+        if not text or not text.strip():
             return None
-        if not text.strip():
-            return None
-
-        # piper wants an output file path (no stdout streaming for WAV in older versions)
-        out = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-        out.close()
-        out_path = out.name
         try:
-            proc = await asyncio.create_subprocess_exec(
-                self.bin,
-                "--model", str(self.voice_path),
-                "--output_file", out_path,
-                stdin=asyncio.subprocess.PIPE,
-                stdout=asyncio.subprocess.DEVNULL,
-                stderr=asyncio.subprocess.PIPE,
-            )
-            _, stderr = await proc.communicate(text.encode("utf-8"))
-            if proc.returncode != 0:
-                log.error("piper exited %s: %s", proc.returncode, stderr.decode("utf-8", "replace")[:300])
-                return None
-            with open(out_path, "rb") as f:
-                return f.read()
-        except Exception:
-            log.exception("piper synthesis failed")
+            async with httpx.AsyncClient(timeout=15.0) as client:
+                resp = await client.post(
+                    self._url,
+                    json={
+                        "input": text,
+                        "voice": self.voice,
+                        "response_format": "wav",
+                    },
+                )
+                resp.raise_for_status()
+                # An empty body is not playable audio; report it as a failure.
+                return resp.content or None
+        except httpx.TimeoutException:
+            log.warning("TTS timeout for voice=%s (text=%s…)", self.voice, text[:60])
+            return None
+        except httpx.HTTPError as exc:
+            # Daemon down or non-2xx is the expected degradation: one line, no traceback.
+            log.warning("TTS request failed for voice=%s: %s", self.voice, exc)
+            return None
+        except Exception:
+            log.exception("TTS synthesis failed for voice=%s", self.voice)
             return None
-        finally:
-            try:
-                os.unlink(out_path)
-            except OSError:
-                pass