chat/app/stt.py

"""Whisper.cpp STT adapter for chat.saiden.dev.

Transcribes microphone audio (webm/opus from browser) → text.
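Pipeline: ffmpeg → 16kHz mono WAV → whisper-cli (--output-txt) → transcript text file.

If whisper-cli, ffmpeg or a model file is missing, a warning is logged once and
transcription returns None instead of raising.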
"""

from __future__ import annotations

import asyncio
import logging
import os
import re
import shutil
import tempfile
from pathlib import Path

# Whisper.cpp special tokens — emitted for non-speech audio.
# If the entire transcript is one of these, treat as no speech.
_NON_SPEECH = re.compile(
    r"^\s*[\[\(](?:BLANK_AUDIO|INAUDIBLE|NO_SPEECH|MUSIC|NOISE|SILENCE|SOUND|"
    r"APPLAUSE|LAUGHTER|CROSSTALK|BREATHING|UNINTELLIGIBLE)[\]\)]\s*$",
    re.IGNORECASE,
)

log = logging.getLogger("chat-saiden.stt")

WHISPER_BIN = shutil.which("whisper-cli") or os.environ.get("WHISPER_BIN")
FFMPEG_BIN = shutil.which("ffmpeg") or os.environ.get("FFMPEG_BIN")

_MODEL_SEARCH = [
    Path.home() / ".cache/whisper/ggml-base.en.bin",
    Path.home() / ".cache/whisper/ggml-small.en.bin",
    Path.home() / ".cache/whisper/ggml-tiny.en.bin",
    Path("/usr/local/share/whisper.cpp/ggml-base.en.bin"),
    Path("/usr/share/whisper.cpp/ggml-base.en.bin"),
]


def _resolve_model() -> Path | None:
    override = os.environ.get("WHISPER_MODEL_PATH")
    if override:
        p = Path(override)
        return p if p.exists() else None
    for cand in _MODEL_SEARCH:
        if cand.exists():
            return cand
    return None


class STT:
    """Whisper.cpp speech-to-text wrapper.

    The whisper-cli binary, ffmpeg and a model file are resolved once, at
    construction time. If any of the three is missing a warning is logged,
    :attr:`available` is ``False`` and :meth:`transcribe` returns ``None``
    rather than raising.
    """

    def __init__(self) -> None:
        """Resolve external tools and the model, logging what was found."""
        self.bin = WHISPER_BIN
        self.ffmpeg = FFMPEG_BIN
        self.model = _resolve_model()
        if not self.bin:
            log.warning("whisper-cli not found — STT disabled")
        elif not self.ffmpeg:
            log.warning("ffmpeg not found — STT disabled")
        elif not self.model:
            log.warning("no whisper model in known locations — STT disabled")
        else:
            log.info("STT enabled — model=%s bin=%s", self.model, self.bin)

    @property
    def available(self) -> bool:
        """True when whisper-cli, ffmpeg and a model file were all resolved."""
        return bool(self.bin and self.ffmpeg and self.model)

    @staticmethod
    async def _run(*argv: str) -> tuple[int, str]:
        """Run ``argv`` to completion and return ``(exit code, stderr text)``.

        stdin and stdout are attached to the null device: neither tool needs
        input, and ffmpeg would otherwise read from the server's inherited
        stdin. stderr is captured for error reporting.
        """
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdin=asyncio.subprocess.DEVNULL,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.PIPE,
        )
        try:
            _, err = await proc.communicate()
        finally:
            # Only reached with a live child when the await was interrupted
            # (e.g. task cancellation). Kill and reap it so it does not keep
            # running against scratch files that are about to be deleted.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass
                await proc.wait()
        return proc.returncode, err.decode("utf-8", "replace")

    async def transcribe(self, audio_bytes: bytes, suffix: str = ".webm") -> str | None:
        """Transcribe an audio clip to text.

        Args:
            audio_bytes: Encoded audio in any container ffmpeg can read.
            suffix: File extension given to the temporary input file.

        Returns:
            The stripped transcript, or ``None`` when STT is unavailable, the
            input is empty, a tool fails, or the transcript is empty or made
            up solely of non-speech markers such as ``[BLANK_AUDIO]``.
        """
        if not self.available or not audio_bytes:
            return None

        try:
            # One scratch directory holds the input, the WAV and whisper's
            # output file, so every exit path cleans all three up.
            with tempfile.TemporaryDirectory(prefix="stt-") as workdir:
                with tempfile.NamedTemporaryFile(
                    suffix=suffix, dir=workdir, delete=False
                ) as src:
                    src.write(audio_bytes)
                wav_path = os.path.join(workdir, "audio.wav")
                out_base = os.path.join(workdir, "transcript")

                # 1. ffmpeg: convert to 16kHz mono WAV (whisper's expected format)
                code, err = await self._run(
                    self.ffmpeg, "-y", "-loglevel", "error",
                    "-i", src.name,
                    "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
                    wav_path,
                )
                if code != 0:
                    log.error("ffmpeg failed: %s", err[:300])
                    return None

                # 2. whisper-cli: transcribe → "<out_base>.txt" (--output-txt)
                code, err = await self._run(
                    self.bin,
                    "-m", str(self.model),
                    "-f", wav_path,
                    "--no-timestamps",
                    "--no-prints",
                    "--output-txt",
                    "-of", out_base,
                )
                if code != 0:
                    log.error("whisper failed: %s", err[:300])
                    return None

                txt_path = Path(out_base + ".txt")
                if not txt_path.exists():
                    log.error("whisper produced no output file")
                    return None
                # errors="replace": one malformed byte should not discard an
                # otherwise usable transcript (same policy as stderr above).
                text = txt_path.read_text(encoding="utf-8", errors="replace").strip()
        except Exception:
            log.exception("transcribe failed")
            return None

        if not text:
            return None
        # Filter non-speech markers. The output file may hold several lines,
        # so test each non-blank line: only a transcript consisting ENTIRELY
        # of markers counts as "no speech".
        spoken_lines = [line for line in text.splitlines() if line.strip()]
        if all(_NON_SPEECH.match(line) for line in spoken_lines):
            log.info("transcript was non-speech marker: %r", text)
            return None
        return text