feat: chatterbox TTS via madcat-tts daemon, Web Speech API STT, styled persona picker
- tts.py: replace piper subprocess with HTTP POST to madcat-tts /v1/audio/speech (chatterbox voice cloning)
- chat.js: replace whisper server upload with browser Web Speech API (webkitSpeechRecognition)
- chat.css: style persona picker — appearance:none select, themed with CSS vars, mobile responsive
- main.py: default TTS voice → bt7274-en
This commit is contained in:
+1
-1
@@ -403,7 +403,7 @@ else:
|
|||||||
|
|
||||||
# --- TTS / STT ---
|
# --- TTS / STT ---
|
||||||
TTS_ENABLED = os.environ.get("TTS_ENABLED", "true").lower() != "false"
|
TTS_ENABLED = os.environ.get("TTS_ENABLED", "true").lower() != "false"
|
||||||
TTS_VOICE = os.environ.get("TTS_VOICE", "en_US-amy-medium")
|
TTS_VOICE = os.environ.get("TTS_VOICE", "bt7274-en")
|
||||||
tts = TTS(voice=TTS_VOICE) if TTS_ENABLED else None
|
tts = TTS(voice=TTS_VOICE) if TTS_ENABLED else None
|
||||||
|
|
||||||
STT_ENABLED = os.environ.get("STT_ENABLED", "true").lower() != "false"
|
STT_ENABLED = os.environ.get("STT_ENABLED", "true").lower() != "false"
|
||||||
|
|||||||
@@ -242,6 +242,76 @@ body {
|
|||||||
.sigil:hover { opacity: 0.85; }
|
.sigil:hover { opacity: 0.85; }
|
||||||
.sigil img { width: 100%; height: 100%; display: block; }
|
.sigil img { width: 100%; height: 100%; display: block; }
|
||||||
|
|
||||||
|
/* ---------- persona picker (inline in topnav) ---------- */
|
||||||
|
.topnav__persona-wrap {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 0.45rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.topnav__persona-label {
|
||||||
|
font-family: var(--sans);
|
||||||
|
font-size: 0.58rem;
|
||||||
|
font-weight: 400;
|
||||||
|
letter-spacing: 0.16em;
|
||||||
|
text-transform: uppercase;
|
||||||
|
color: var(--ink-faint);
|
||||||
|
opacity: 0.45;
|
||||||
|
cursor: default;
|
||||||
|
user-select: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.topnav__persona-select {
|
||||||
|
appearance: none;
|
||||||
|
-webkit-appearance: none;
|
||||||
|
border: 1px solid var(--ink-faint);
|
||||||
|
border-radius: 3px;
|
||||||
|
background: var(--bg-soft);
|
||||||
|
color: var(--ink-muted);
|
||||||
|
font-family: var(--sans);
|
||||||
|
font-size: 0.62rem;
|
||||||
|
font-weight: 400;
|
||||||
|
letter-spacing: 0.08em;
|
||||||
|
text-transform: uppercase;
|
||||||
|
padding: 0.2rem 1.3rem 0.2rem 0.45rem;
|
||||||
|
cursor: pointer;
|
||||||
|
outline: none;
|
||||||
|
opacity: 0.55;
|
||||||
|
transition: opacity 400ms ease, color 400ms ease, border-color 400ms ease;
|
||||||
|
|
||||||
|
/* custom dropdown arrow */
|
||||||
|
background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='8' height='5' fill='none'%3E%3Cpath d='M1 1l3 3 3-3' stroke='%236a6a6f' stroke-width='1.2' stroke-linecap='round' stroke-linejoin='round'/%3E%3C/svg%3E");
|
||||||
|
background-repeat: no-repeat;
|
||||||
|
background-position: right 0.4rem center;
|
||||||
|
}
|
||||||
|
.topnav__persona-select:hover {
|
||||||
|
opacity: 1;
|
||||||
|
color: var(--ink);
|
||||||
|
border-color: var(--ink-muted);
|
||||||
|
}
|
||||||
|
.topnav__persona-select:focus-visible {
|
||||||
|
opacity: 1;
|
||||||
|
color: var(--ink);
|
||||||
|
border-color: var(--coral);
|
||||||
|
}
|
||||||
|
|
||||||
|
.topnav__persona-status {
|
||||||
|
font-family: var(--sans);
|
||||||
|
font-size: 0.58rem;
|
||||||
|
font-weight: 400;
|
||||||
|
letter-spacing: 0.12em;
|
||||||
|
color: var(--coral);
|
||||||
|
opacity: 0.7;
|
||||||
|
transition: opacity 400ms ease;
|
||||||
|
user-select: none;
|
||||||
|
}
|
||||||
|
.topnav__persona-status:empty { display: none; }
|
||||||
|
|
||||||
|
@media (max-width: 600px) {
|
||||||
|
.topnav__persona-label { display: none; }
|
||||||
|
.topnav__persona-select { font-size: 0.58rem; padding: 0.15rem 1.1rem 0.15rem 0.35rem; }
|
||||||
|
}
|
||||||
|
|
||||||
/* ===================== CONVERSATION ===================== */
|
/* ===================== CONVERSATION ===================== */
|
||||||
.conversation {
|
.conversation {
|
||||||
flex: 1 0 auto;
|
flex: 1 0 auto;
|
||||||
|
|||||||
+65
-55
@@ -241,77 +241,87 @@ document.addEventListener('keydown', (e) => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// ---------- voice input (whisper) ----------
|
// ---------- voice input (Web Speech API) ----------
|
||||||
|
|
||||||
const $mic = document.getElementById('mic-button');
|
const $mic = document.getElementById('mic-button');
|
||||||
let mediaRecorder = null;
|
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
|
||||||
let recordedChunks = [];
|
let recognition = null;
|
||||||
let recording = false;
|
let recording = false;
|
||||||
|
|
||||||
async function startRecording() {
|
// Disable mic if browser lacks support
|
||||||
|
if (!SpeechRecognition) {
|
||||||
|
console.warn('Web Speech API not supported — mic disabled');
|
||||||
|
$mic.disabled = true;
|
||||||
|
$mic.title = 'Speech recognition not supported in this browser';
|
||||||
|
}
|
||||||
|
|
||||||
|
function startRecording() {
|
||||||
|
if (!SpeechRecognition) return;
|
||||||
|
if (recognition) {
|
||||||
|
// stale instance — tear it down first
|
||||||
|
try { recognition.abort(); } catch {}
|
||||||
|
recognition = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
recognition = new SpeechRecognition();
|
||||||
|
recognition.continuous = false;
|
||||||
|
recognition.interimResults = true;
|
||||||
|
recognition.lang = 'en-US';
|
||||||
|
|
||||||
|
recognition.addEventListener('result', (e) => {
|
||||||
|
// Build transcript from all results (interim + final)
|
||||||
|
let transcript = '';
|
||||||
|
for (let i = 0; i < e.results.length; i++) {
|
||||||
|
transcript += e.results[i][0].transcript;
|
||||||
|
}
|
||||||
|
$input.value = transcript;
|
||||||
|
});
|
||||||
|
|
||||||
|
recognition.addEventListener('end', () => {
|
||||||
|
recording = false;
|
||||||
|
$mic.classList.remove('recording');
|
||||||
|
recognition = null;
|
||||||
|
|
||||||
|
// Auto-send if we got text
|
||||||
|
const text = $input.value.trim();
|
||||||
|
if (text) {
|
||||||
|
$form.dispatchEvent(new Event('submit', { cancelable: true }));
|
||||||
|
} else {
|
||||||
|
flashMicEmpty();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
recognition.addEventListener('error', (e) => {
|
||||||
|
recording = false;
|
||||||
|
$mic.classList.remove('recording');
|
||||||
|
recognition = null;
|
||||||
|
|
||||||
|
// 'no-speech' and 'aborted' are expected — not worth alarming the user
|
||||||
|
if (e.error === 'no-speech') {
|
||||||
|
flashMicEmpty();
|
||||||
|
} else if (e.error !== 'aborted') {
|
||||||
|
console.warn('speech recognition error:', e.error);
|
||||||
|
flashMicEmpty();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
recognition.start();
|
||||||
recordedChunks = [];
|
|
||||||
// Prefer webm/opus if browser supports it (Chrome/FF). Safari may need fallback.
|
|
||||||
const mimeTypes = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4', ''];
|
|
||||||
const mime = mimeTypes.find(m => !m || MediaRecorder.isTypeSupported(m)) || '';
|
|
||||||
mediaRecorder = new MediaRecorder(stream, mime ? { mimeType: mime } : undefined);
|
|
||||||
mediaRecorder.addEventListener('dataavailable', (e) => {
|
|
||||||
if (e.data && e.data.size > 0) recordedChunks.push(e.data);
|
|
||||||
});
|
|
||||||
mediaRecorder.addEventListener('stop', () => {
|
|
||||||
// release the mic immediately
|
|
||||||
stream.getTracks().forEach(t => t.stop());
|
|
||||||
handleRecorded();
|
|
||||||
});
|
|
||||||
mediaRecorder.start();
|
|
||||||
recording = true;
|
recording = true;
|
||||||
$mic.classList.add('recording');
|
$mic.classList.add('recording');
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.warn('microphone unavailable:', err.message);
|
console.warn('speech recognition failed to start:', err.message);
|
||||||
recording = false;
|
recording = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function stopRecording() {
|
function stopRecording() {
|
||||||
if (!mediaRecorder || mediaRecorder.state === 'inactive') return;
|
if (!recognition) return;
|
||||||
mediaRecorder.stop();
|
try { recognition.stop(); } catch {}
|
||||||
recording = false;
|
// 'end' event handler cleans up recording state and sends
|
||||||
$mic.classList.remove('recording');
|
|
||||||
$mic.classList.add('transcribing');
|
|
||||||
}
|
|
||||||
|
|
||||||
async function handleRecorded() {
|
|
||||||
if (!recordedChunks.length) {
|
|
||||||
$mic.classList.remove('transcribing');
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const blob = new Blob(recordedChunks, { type: recordedChunks[0].type || 'audio/webm' });
|
|
||||||
const ext = (blob.type.split('/')[1] || 'webm').split(';')[0];
|
|
||||||
const form = new FormData();
|
|
||||||
form.append('audio', blob, `speech.${ext}`);
|
|
||||||
|
|
||||||
try {
|
|
||||||
const resp = await fetch('/api/transcribe', { method: 'POST', body: form });
|
|
||||||
if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
|
|
||||||
const { text } = await resp.json();
|
|
||||||
if (text && text.trim()) {
|
|
||||||
$input.value = text;
|
|
||||||
// auto-send — Her vibe is "speak and she hears"
|
|
||||||
$form.dispatchEvent(new Event('submit', { cancelable: true }));
|
|
||||||
} else {
|
|
||||||
flashMicEmpty();
|
|
||||||
}
|
|
||||||
} catch (err) {
|
|
||||||
console.warn('transcription failed:', err.message);
|
|
||||||
flashMicEmpty();
|
|
||||||
} finally {
|
|
||||||
$mic.classList.remove('transcribing');
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function flashMicEmpty() {
|
function flashMicEmpty() {
|
||||||
// brief visual hint that nothing was heard — no toast, no popup, just a flash
|
|
||||||
$mic.classList.add('empty');
|
$mic.classList.add('empty');
|
||||||
setTimeout(() => $mic.classList.remove('empty'), 700);
|
setTimeout(() => $mic.classList.remove('empty'), 700);
|
||||||
}
|
}
|
||||||
|
|||||||
+29
-75
@@ -1,101 +1,55 @@
|
|||||||
"""Piper TTS adapter for chat.saiden.dev.
|
"""HTTP TTS adapter for chat.saiden.dev — madcat-tts daemon (chatterbox).
|
||||||
|
|
||||||
Synthesises text → WAV bytes by subprocess'ing the `piper` CLI binary
|
Calls the madcat-tts daemon's OpenAI-compatible /v1/audio/speech endpoint
|
||||||
(already installed on every host that runs marauder-os).
|
to synthesize text → WAV bytes via chatterbox voice cloning.
|
||||||
|
|
||||||
Designed to fail silently — if piper is missing or synthesis errors,
|
Designed to fail silently — if the daemon is down or synthesis errors,
|
||||||
the chat still works, just without voice.
|
the chat still works, just without voice.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import shutil
|
|
||||||
import tempfile
|
import httpx
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
log = logging.getLogger("chat-saiden.tts")
|
log = logging.getLogger("chat-saiden.tts")
|
||||||
|
|
||||||
# Where the voice .onnx files live across hosts.
|
MADCAT_TTS_URL = os.environ.get("MADCAT_TTS_URL", "http://localhost:14099")
|
||||||
# Order: env override → macOS marauder → linux marauder → linux marauder-agent (mesh node) → linux ~/.local
|
|
||||||
_VOICE_SEARCH_PATHS = [
|
|
||||||
Path.home() / "Library/Application Support/marauder/voices",
|
|
||||||
Path("/home") / os.environ.get("USER", "marauder") / ".local/share/marauder/voices",
|
|
||||||
Path.home() / ".local/share/marauder/voices",
|
|
||||||
Path.home() / ".local/share/psn/voices",
|
|
||||||
Path.home() / ".local/share/piper/voices",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def _resolve_voice_path(name: str) -> Path | None:
|
|
||||||
"""Return absolute path to a voice model by short name, or None."""
|
|
||||||
# explicit override
|
|
||||||
override = os.environ.get("TTS_VOICE_PATH")
|
|
||||||
if override:
|
|
||||||
p = Path(override)
|
|
||||||
return p if p.exists() else None
|
|
||||||
|
|
||||||
for base in _VOICE_SEARCH_PATHS:
|
|
||||||
candidate = base / f"{name}.onnx"
|
|
||||||
if candidate.exists():
|
|
||||||
return candidate
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
PIPER_BIN = shutil.which("piper") or os.environ.get("PIPER_BIN")
|
|
||||||
|
|
||||||
|
|
||||||
class TTS:
|
class TTS:
|
||||||
"""Subprocess-based piper synthesizer with graceful fallback."""
|
"""HTTP-based madcat-tts synthesizer with graceful fallback."""
|
||||||
|
|
||||||
def __init__(self, voice: str = "en_US-amy-medium") -> None:
|
def __init__(self, voice: str = "bt7274-en") -> None:
|
||||||
self.voice = voice
|
self.voice = voice
|
||||||
self.voice_path = _resolve_voice_path(voice)
|
self._url = f"{MADCAT_TTS_URL.rstrip('/')}/v1/audio/speech"
|
||||||
self.bin = PIPER_BIN
|
log.info("TTS enabled — voice=%s url=%s", voice, self._url)
|
||||||
if not self.bin:
|
|
||||||
log.warning("piper binary not found on PATH — TTS disabled")
|
|
||||||
elif not self.voice_path:
|
|
||||||
log.warning("voice '%s' not found in known locations — TTS disabled", voice)
|
|
||||||
else:
|
|
||||||
log.info("TTS enabled — voice=%s path=%s bin=%s", voice, self.voice_path, self.bin)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def available(self) -> bool:
|
def available(self) -> bool:
|
||||||
return bool(self.bin and self.voice_path)
|
return True
|
||||||
|
|
||||||
async def synthesize(self, text: str) -> bytes | None:
|
async def synthesize(self, text: str) -> bytes | None:
|
||||||
"""Return WAV bytes, or None on failure / unavailable."""
|
"""Return WAV bytes, or None on failure / unavailable."""
|
||||||
if not self.available:
|
if not text or not text.strip():
|
||||||
return None
|
return None
|
||||||
if not text.strip():
|
|
||||||
return None
|
|
||||||
|
|
||||||
# piper wants an output file path (no stdout streaming for WAV in older versions)
|
|
||||||
out = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
|
||||||
out.close()
|
|
||||||
out_path = out.name
|
|
||||||
try:
|
try:
|
||||||
proc = await asyncio.create_subprocess_exec(
|
async with httpx.AsyncClient(timeout=15.0) as client:
|
||||||
self.bin,
|
resp = await client.post(
|
||||||
"--model", str(self.voice_path),
|
self._url,
|
||||||
"--output_file", out_path,
|
json={
|
||||||
stdin=asyncio.subprocess.PIPE,
|
"input": text,
|
||||||
stdout=asyncio.subprocess.DEVNULL,
|
"voice": self.voice,
|
||||||
stderr=asyncio.subprocess.PIPE,
|
"response_format": "wav",
|
||||||
)
|
},
|
||||||
_, stderr = await proc.communicate(text.encode("utf-8"))
|
)
|
||||||
if proc.returncode != 0:
|
resp.raise_for_status()
|
||||||
log.error("piper exited %s: %s", proc.returncode, stderr.decode("utf-8", "replace")[:300])
|
return resp.content
|
||||||
return None
|
except httpx.TimeoutException:
|
||||||
with open(out_path, "rb") as f:
|
log.warning("TTS timeout for voice=%s (text=%s…)", self.voice, text[:60])
|
||||||
return f.read()
|
return None
|
||||||
except Exception:
|
except Exception:
|
||||||
log.exception("piper synthesis failed")
|
log.exception("TTS synthesis failed for voice=%s", self.voice)
|
||||||
return None
|
return None
|
||||||
finally:
|
|
||||||
try:
|
|
||||||
os.unlink(out_path)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
|
|||||||
Reference in New Issue
Block a user