"""Whisper.cpp STT adapter for chat.saiden.dev.

Transcribes microphone audio (webm/opus from browser) → text.

Pipeline: ffmpeg → 16kHz mono WAV → whisper-cli → text file → transcript.

If whisper-cli, ffmpeg, or a model file is missing, STT is disabled: a warning
is logged when ``STT`` is constructed and ``STT.transcribe`` returns ``None``.
"""

from __future__ import annotations

import asyncio
import logging
import os
import re
import shutil
import tempfile
from pathlib import Path

# Whisper.cpp special tokens — emitted for non-speech audio.
# If the entire transcript is one of these, treat as no speech.
_NON_SPEECH = re.compile(
    r"^\s*[\[\(](?:BLANK_AUDIO|INAUDIBLE|NO_SPEECH|MUSIC|NOISE|SILENCE|SOUND|"
    r"APPLAUSE|LAUGHTER|CROSSTALK|BREATHING|UNINTELLIGIBLE)[\]\)]\s*$",
    re.IGNORECASE,
)

log = logging.getLogger("chat-saiden.stt")

# Binaries are looked up on PATH first; the environment variable is only a
# fallback used when nothing is found on PATH.
WHISPER_BIN = shutil.which("whisper-cli") or os.environ.get("WHISPER_BIN")
FFMPEG_BIN = shutil.which("ffmpeg") or os.environ.get("FFMPEG_BIN")

# Model files probed in order when WHISPER_MODEL_PATH is not set.
_MODEL_SEARCH = [
    Path.home() / ".cache/whisper/ggml-base.en.bin",
    Path.home() / ".cache/whisper/ggml-small.en.bin",
    Path.home() / ".cache/whisper/ggml-tiny.en.bin",
    Path("/usr/local/share/whisper.cpp/ggml-base.en.bin"),
    Path("/usr/share/whisper.cpp/ggml-base.en.bin"),
]


def _resolve_model() -> Path | None:
    """Return the model file to use, or None when none can be found.

    A non-empty WHISPER_MODEL_PATH is authoritative: if it is set but the path
    does not exist, the search list is NOT consulted and None is returned.
    Otherwise the first existing entry of ``_MODEL_SEARCH`` wins.
    """
    override = os.environ.get("WHISPER_MODEL_PATH")
    if override:
        p = Path(override)
        return p if p.exists() else None
    for cand in _MODEL_SEARCH:
        if cand.exists():
            return cand
    return None


async def _run(*argv: str) -> tuple[int | None, str]:
    """Run *argv* to completion and return ``(returncode, stderr_text)``.

    stdout is discarded (neither tool's stdout is consumed by this module);
    stderr is captured and decoded leniently so it can be logged on failure.
    """
    proc = await asyncio.create_subprocess_exec(
        *argv,
        stdout=asyncio.subprocess.DEVNULL,
        stderr=asyncio.subprocess.PIPE,
    )
    _, err = await proc.communicate()
    return proc.returncode, err.decode("utf-8", "replace")


class STT:
    """Whisper-cpp wrapper."""

    def __init__(self) -> None:
        """Capture the tool paths and model, logging why STT is disabled if so."""
        self.bin = WHISPER_BIN
        self.ffmpeg = FFMPEG_BIN
        self.model = _resolve_model()
        if not self.bin:
            log.warning("whisper-cli not found — STT disabled")
        elif not self.ffmpeg:
            log.warning("ffmpeg not found — STT disabled")
        elif not self.model:
            log.warning("no whisper model in known locations — STT disabled")
        else:
            log.info("STT enabled — model=%s bin=%s", self.model, self.bin)

    @property
    def available(self) -> bool:
        """True when whisper-cli, ffmpeg and a model file were all resolved."""
        return bool(self.bin and self.ffmpeg and self.model)

    async def transcribe(self, audio_bytes: bytes, suffix: str = ".webm") -> str | None:
        """Return transcript text, or None on failure / unavailable.

        Args:
            audio_bytes: Raw audio payload as received from the client.
            suffix: File suffix given to the temporary input file handed to
                ffmpeg.

        Returns:
            The stripped transcript, or None when STT is unavailable, the
            input is empty, either tool fails, the transcript is empty, or
            the whole transcript is a non-speech marker such as
            ``[BLANK_AUDIO]``.
        """
        if not self.available or not audio_bytes:
            return None

        # Every artefact (input, WAV, whisper's .txt) lives in one private
        # directory so a single rmtree in ``finally`` removes all of them on
        # every exit path — including a whisper run that wrote its output
        # file and then exited non-zero.
        workdir: str | None = None
        try:
            workdir = tempfile.mkdtemp(prefix="stt-")
            with tempfile.NamedTemporaryFile(
                suffix=suffix, dir=workdir, delete=False
            ) as src:
                src.write(audio_bytes)
            wav_path = os.path.join(workdir, "audio.wav")
            out_base = os.path.join(workdir, "transcript")

            # 1. ffmpeg: convert to 16kHz mono WAV (whisper's expected format)
            code, err = await _run(
                self.ffmpeg, "-y", "-loglevel", "error",
                "-i", src.name,
                "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
                wav_path,
            )
            if code != 0:
                log.error("ffmpeg failed: %s", err[:300])
                return None

            # 2. whisper-cli: transcribe → plain text written to <out_base>.txt
            code, err = await _run(
                self.bin, "-m", str(self.model),
                "-f", wav_path,
                "--no-timestamps", "--no-prints",
                "--output-txt", "-of", out_base,
            )
            if code != 0:
                log.error("whisper failed: %s", err[:300])
                return None

            txt_path = Path(out_base + ".txt")
            if not txt_path.exists():
                log.error("whisper produced no output file")
                return None
            text = txt_path.read_text(encoding="utf-8").strip()
            if not text:
                return None

            # filter non-speech markers — whisper.cpp emits "[BLANK_AUDIO]" etc.
            if _NON_SPEECH.match(text):
                log.info("transcript was non-speech marker: %r", text)
                return None
            return text
        except Exception:
            log.exception("transcribe failed")
            return None
        finally:
            # Best-effort cleanup; a failure to delete must not mask the result.
            if workdir is not None:
                shutil.rmtree(workdir, ignore_errors=True)