Files
2026-05-29 13:47:34 +02:00

144 lines
4.8 KiB
Python

"""Whisper.cpp STT adapter for chat.saiden.dev.
Transcribes microphone audio (webm/opus from browser) → text.
Pipeline: ffmpeg → 16kHz mono WAV → whisper-cli → text read from its --output-txt file.
If whisper-cli, ffmpeg, or a model file is missing, a warning is logged at construction and transcribe() returns None.
"""
from __future__ import annotations
import asyncio
import logging
import os
import re
import shutil
import tempfile
from pathlib import Path
# Logger shared by everything in this module.
log = logging.getLogger("chat-saiden.stt")

# Bracketed or parenthesised markers that whisper.cpp emits for non-speech
# audio. The pattern is anchored at both ends, so it only matches text that
# consists of exactly one such marker (plus surrounding whitespace).
_NON_SPEECH = re.compile(
    r"^\s*[\[\(](?:BLANK_AUDIO|INAUDIBLE|NO_SPEECH|MUSIC|NOISE|SILENCE|SOUND|"
    r"APPLAUSE|LAUGHTER|CROSSTALK|BREATHING|UNINTELLIGIBLE)[\]\)]\s*$",
    flags=re.IGNORECASE,
)

# External tools: a PATH lookup wins; the environment variable is only
# consulted when the executable is not found on PATH.
FFMPEG_BIN: str | None = shutil.which("ffmpeg") or os.environ.get("FFMPEG_BIN")
WHISPER_BIN: str | None = shutil.which("whisper-cli") or os.environ.get("WHISPER_BIN")

# Model files probed in order: per-user cache first, then system-wide installs.
_MODEL_SEARCH: list[Path] = [
    *(
        Path.home() / relative
        for relative in (
            ".cache/whisper/ggml-base.en.bin",
            ".cache/whisper/ggml-small.en.bin",
            ".cache/whisper/ggml-tiny.en.bin",
        )
    ),
    Path("/usr/local/share/whisper.cpp/ggml-base.en.bin"),
    Path("/usr/share/whisper.cpp/ggml-base.en.bin"),
]
def _resolve_model() -> Path | None:
    """Locate the whisper.cpp model file to use.

    A non-empty ``WHISPER_MODEL_PATH`` environment variable is authoritative:
    its path is returned when it names a regular file, otherwise ``None`` is
    returned and the built-in search list is *not* used as a fallback.
    Without an override, the first entry of ``_MODEL_SEARCH`` that is a
    regular file is returned, or ``None`` when there is none.
    """
    override = os.environ.get("WHISPER_MODEL_PATH")
    if override:
        path = Path(override)
        # is_file(), not exists(): a directory at this path is not a usable
        # model and would only make every later whisper run fail.
        return path if path.is_file() else None
    return next((cand for cand in _MODEL_SEARCH if cand.is_file()), None)
class STT:
    """Whisper.cpp speech-to-text wrapper.

    The whisper-cli binary, the ffmpeg binary and the model file are resolved
    once, at construction. If any of them is missing a warning is logged and
    the instance stays disabled: ``available`` is False and ``transcribe``
    returns None without doing any work.
    """

    def __init__(self) -> None:
        """Capture the tool paths and model, and log whether STT is usable."""
        self.bin = WHISPER_BIN
        self.ffmpeg = FFMPEG_BIN
        self.model = _resolve_model()
        if not self.bin:
            log.warning("whisper-cli not found — STT disabled")
        elif not self.ffmpeg:
            log.warning("ffmpeg not found — STT disabled")
        elif not self.model:
            log.warning("no whisper model in known locations — STT disabled")
        else:
            log.info("STT enabled — model=%s bin=%s", self.model, self.bin)

    @property
    def available(self) -> bool:
        """True when whisper-cli, ffmpeg and a model were all found."""
        return bool(self.bin and self.ffmpeg and self.model)

    async def _run(self, label: str, *argv: str) -> bool:
        """Run *argv* to completion; return True when it exited with status 0.

        On a non-zero exit the first 300 characters of stderr are logged as
        ``"<label> failed: ..."`` and False is returned.
        """
        proc = await asyncio.create_subprocess_exec(
            *argv,
            # stdout is never used by the caller, so don't buffer it.
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.PIPE,
        )
        try:
            _, err = await proc.communicate()
        except BaseException:
            # Interrupted while waiting (typically task cancellation): the
            # child would otherwise keep running after the caller has
            # deleted the files it is working on.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass
            raise
        if proc.returncode != 0:
            log.error("%s failed: %s", label, err.decode("utf-8", "replace")[:300])
            return False
        return True

    async def transcribe(self, audio_bytes: bytes, suffix: str = ".webm") -> str | None:
        """Transcribe an audio clip to text.

        Args:
            audio_bytes: Encoded audio in any container ffmpeg can read.
            suffix: File extension given to the temporary input file.

        Returns:
            The stripped transcript, or None when STT is unavailable, the
            input is empty, ffmpeg or whisper-cli fails, no text is produced,
            or the output consists only of non-speech markers.
        """
        if not self.available or not audio_bytes:
            return None

        # Every temp path is recorded before it is used, so the finally
        # clause removes all of them on every exit path.
        scratch: list[Path] = []
        try:
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as src:
                scratch.append(Path(src.name))
                src.write(audio_bytes)
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav:
                scratch.append(Path(wav.name))
            out_base = wav.name + ".out"
            # whisper-cli appends ".txt" to the -of base name.
            txt_path = Path(out_base + ".txt")
            scratch.append(txt_path)

            # 1. ffmpeg: convert to 16kHz mono WAV (whisper's expected format)
            converted = await self._run(
                "ffmpeg",
                self.ffmpeg, "-y", "-loglevel", "error",
                "-i", src.name,
                "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
                wav.name,
            )
            if not converted:
                return None

            # 2. whisper-cli: transcribe into the --output-txt file
            transcribed = await self._run(
                "whisper",
                self.bin,
                "-m", str(self.model),
                "-f", wav.name,
                "--no-timestamps",
                "--no-prints",
                "--output-txt",
                "-of", out_base,
            )
            if not transcribed:
                return None

            if not txt_path.exists():
                log.error("whisper produced no output file")
                return None
            text = txt_path.read_text(encoding="utf-8").strip()
            if not text:
                return None

            # filter non-speech markers — whisper.cpp emits "[BLANK_AUDIO]" etc.
            # The output may span several lines (presumably one per segment),
            # so the clip counts as non-speech only when every non-blank line
            # is a marker.
            spoken = [line for line in text.splitlines() if line.strip()]
            if all(_NON_SPEECH.match(line) for line in spoken):
                log.info("transcript was non-speech marker: %r", text)
                return None
            return text
        except Exception:
            log.exception("transcribe failed")
            return None
        finally:
            for path in scratch:
                try:
                    path.unlink()
                except OSError:
                    pass