chore: initial commit — chat-saiden web chat baseline

2026-05-29 13:47:34 +02:00
commit 96ba8f4b6e
28 changed files with 4852 additions and 0 deletions
@@ -0,0 +1,143 @@
+"""Whisper.cpp STT adapter for chat.saiden.dev.
+
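+Transcribes microphone audio (webm/opus from browser) → text.
+Pipeline: ffmpeg → 16kHz mono WAV → whisper-cli → transcript .txt file.
+
+Degrades gracefully if whisper-cli, ffmpeg, or a model is missing: a warning
+is logged when STT is constructed and transcribe() returns None.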
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import re
+import shutil
+import tempfile
+from pathlib import Path
+
+# Whisper.cpp special tokens (e.g. "[BLANK_AUDIO]") — emitted for non-speech
+# audio. The pattern matches only when the ENTIRE transcript is a single marker
+# wrapped in [] or (), ignoring case and surrounding whitespace; such a
+# transcript is treated as no speech.
+_NON_SPEECH = re.compile(
+    r"^\s*[\[\(](?:BLANK_AUDIO|INAUDIBLE|NO_SPEECH|MUSIC|NOISE|SILENCE|SOUND|"
+    r"APPLAUSE|LAUGHTER|CROSSTALK|BREATHING|UNINTELLIGIBLE)[\]\)]\s*$",
+    re.IGNORECASE,
+)
+
+log = logging.getLogger("chat-saiden.stt")
+
+WHISPER_BIN = shutil.which("whisper-cli") or os.environ.get("WHISPER_BIN")
+FFMPEG_BIN = shutil.which("ffmpeg") or os.environ.get("FFMPEG_BIN")
+
+_MODEL_SEARCH = [
+    Path.home() / ".cache/whisper/ggml-base.en.bin",
+    Path.home() / ".cache/whisper/ggml-small.en.bin",
+    Path.home() / ".cache/whisper/ggml-tiny.en.bin",
+    Path("/usr/local/share/whisper.cpp/ggml-base.en.bin"),
+    Path("/usr/share/whisper.cpp/ggml-base.en.bin"),
+]
+
+
+def _resolve_model() -> Path | None:
+    override = os.environ.get("WHISPER_MODEL_PATH")
+    if override:
+        p = Path(override)
+        return p if p.exists() else None
+    for cand in _MODEL_SEARCH:
+        if cand.exists():
+            return cand
+    return None
+
+
+class STT:
+    """Whisper-cpp wrapper."""
+
+    def __init__(self) -> None:
+        self.bin = WHISPER_BIN
+        self.ffmpeg = FFMPEG_BIN
+        self.model = _resolve_model()
+        if not self.bin:
+            log.warning("whisper-cli not found — STT disabled")
+        elif not self.ffmpeg:
+            log.warning("ffmpeg not found — STT disabled")
+        elif not self.model:
+            log.warning("no whisper model in known locations — STT disabled")
+        else:
+            log.info("STT enabled — model=%s bin=%s", self.model, self.bin)
+
+    @property
+    def available(self) -> bool:
+        return bool(self.bin and self.ffmpeg and self.model)
+
+    async def transcribe(self, audio_bytes: bytes, suffix: str = ".webm") -> str | None:
+        """Return transcript text, or None on failure / unavailable."""
+        if not self.available:
+            return None
+        if not audio_bytes:
+            return None
+
+        tmp_in = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
+        tmp_in.write(audio_bytes)
+        tmp_in.close()
+
+        tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+        tmp_wav.close()
+
+        try:
+            # 1. ffmpeg: convert to 16kHz mono WAV (whisper's expected format)
+            ff = await asyncio.create_subprocess_exec(
+                self.ffmpeg, "-y", "-loglevel", "error",
+                "-i", tmp_in.name,
+                "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
+                tmp_wav.name,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            _, ff_err = await ff.communicate()
+            if ff.returncode != 0:
+                log.error("ffmpeg failed: %s", ff_err.decode("utf-8", "replace")[:300])
+                return None
+
+            # 2. whisper-cli: transcribe → plain text on stdout
+            wh = await asyncio.create_subprocess_exec(
+                self.bin,
+                "-m", str(self.model),
+                "-f", tmp_wav.name,
+                "--no-timestamps",
+                "--no-prints",
+                "--output-txt",
+                "-of", tmp_wav.name + ".out",
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            _, wh_err = await wh.communicate()
+            if wh.returncode != 0:
+                log.error("whisper failed: %s", wh_err.decode("utf-8", "replace")[:300])
+                return None
+
+            txt_path = Path(tmp_wav.name + ".out.txt")
+            if not txt_path.exists():
+                log.error("whisper produced no output file")
+                return None
+            text = txt_path.read_text(encoding="utf-8").strip()
+            try:
+                txt_path.unlink()
+            except OSError:
+                pass
+            if not text:
+                return None
+            # filter non-speech markers — whisper.cpp emits "[BLANK_AUDIO]" etc.
+            if _NON_SPEECH.match(text):
+                log.info("transcript was non-speech marker: %r", text)
+                return None
+            return text
+
+        except Exception:
+            log.exception("transcribe failed")
+            return None
+        finally:
+            for p in (tmp_in.name, tmp_wav.name):
+                try:
+                    os.unlink(p)
+                except OSError:
+                    pass