144 lines
4.8 KiB
Python
144 lines
4.8 KiB
Python
"""Whisper.cpp STT adapter for chat.saiden.dev.
|
|
|
|
Transcribes microphone audio (webm/opus from browser) → text.
|
|
Pipeline: ffmpeg → 16kHz mono WAV → whisper-cli → text file, read back as the transcript.
|
|
|
|
If whisper-cli, ffmpeg, or a model is missing, STT is disabled: a warning is logged and transcribe() returns None.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import re
|
|
import shutil
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
# whisper.cpp reports non-speech audio with bracketed or parenthesised marker
# tokens such as "[BLANK_AUDIO]" or "(music)". A transcript that consists of
# exactly one such marker (surrounding whitespace allowed, case-insensitive)
# is treated as "nothing was said".
_NON_SPEECH = re.compile(
    r"^\s*[\[\(](?:"
    + "|".join(
        (
            "BLANK_AUDIO",
            "INAUDIBLE",
            "NO_SPEECH",
            "MUSIC",
            "NOISE",
            "SILENCE",
            "SOUND",
            "APPLAUSE",
            "LAUGHTER",
            "CROSSTALK",
            "BREATHING",
            "UNINTELLIGIBLE",
        )
    )
    + r")[\]\)]\s*$",
    flags=re.IGNORECASE,
)
|
|
|
|
# Module logger; the name is a fixed string rather than derived from __name__.
log = logging.getLogger("chat-saiden.stt")

# Executable lookup: whatever is found on PATH wins. The WHISPER_BIN and
# FFMPEG_BIN environment variables are consulted only when the PATH search
# comes back empty. Either constant may end up None (or an empty string),
# in which case STT reports itself as unavailable.
WHISPER_BIN = shutil.which("whisper-cli")
if not WHISPER_BIN:
    WHISPER_BIN = os.environ.get("WHISPER_BIN")

FFMPEG_BIN = shutil.which("ffmpeg")
if not FFMPEG_BIN:
    FFMPEG_BIN = os.environ.get("FFMPEG_BIN")
|
|
|
|
# Default model locations, in priority order: the per-user cache first
# (base, then small, then tiny English models), followed by the system-wide
# whisper.cpp share directories. _resolve_model() returns the first hit.
_MODEL_SEARCH = [
    Path.home().joinpath(relative)
    for relative in (
        ".cache/whisper/ggml-base.en.bin",
        ".cache/whisper/ggml-small.en.bin",
        ".cache/whisper/ggml-tiny.en.bin",
    )
] + [
    Path(system_wide)
    for system_wide in (
        "/usr/local/share/whisper.cpp/ggml-base.en.bin",
        "/usr/share/whisper.cpp/ggml-base.en.bin",
    )
]
|
|
|
|
|
|
def _resolve_model() -> Path | None:
|
|
override = os.environ.get("WHISPER_MODEL_PATH")
|
|
if override:
|
|
p = Path(override)
|
|
return p if p.exists() else None
|
|
for cand in _MODEL_SEARCH:
|
|
if cand.exists():
|
|
return cand
|
|
return None
|
|
|
|
|
|
class STT:
|
|
"""Whisper-cpp wrapper."""
|
|
|
|
def __init__(self) -> None:
|
|
self.bin = WHISPER_BIN
|
|
self.ffmpeg = FFMPEG_BIN
|
|
self.model = _resolve_model()
|
|
if not self.bin:
|
|
log.warning("whisper-cli not found — STT disabled")
|
|
elif not self.ffmpeg:
|
|
log.warning("ffmpeg not found — STT disabled")
|
|
elif not self.model:
|
|
log.warning("no whisper model in known locations — STT disabled")
|
|
else:
|
|
log.info("STT enabled — model=%s bin=%s", self.model, self.bin)
|
|
|
|
@property
|
|
def available(self) -> bool:
|
|
return bool(self.bin and self.ffmpeg and self.model)
|
|
|
|
async def transcribe(self, audio_bytes: bytes, suffix: str = ".webm") -> str | None:
|
|
"""Return transcript text, or None on failure / unavailable."""
|
|
if not self.available:
|
|
return None
|
|
if not audio_bytes:
|
|
return None
|
|
|
|
tmp_in = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
|
|
tmp_in.write(audio_bytes)
|
|
tmp_in.close()
|
|
|
|
tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
|
tmp_wav.close()
|
|
|
|
try:
|
|
# 1. ffmpeg: convert to 16kHz mono WAV (whisper's expected format)
|
|
ff = await asyncio.create_subprocess_exec(
|
|
self.ffmpeg, "-y", "-loglevel", "error",
|
|
"-i", tmp_in.name,
|
|
"-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
|
|
tmp_wav.name,
|
|
stdout=asyncio.subprocess.PIPE,
|
|
stderr=asyncio.subprocess.PIPE,
|
|
)
|
|
_, ff_err = await ff.communicate()
|
|
if ff.returncode != 0:
|
|
log.error("ffmpeg failed: %s", ff_err.decode("utf-8", "replace")[:300])
|
|
return None
|
|
|
|
# 2. whisper-cli: transcribe → plain text on stdout
|
|
wh = await asyncio.create_subprocess_exec(
|
|
self.bin,
|
|
"-m", str(self.model),
|
|
"-f", tmp_wav.name,
|
|
"--no-timestamps",
|
|
"--no-prints",
|
|
"--output-txt",
|
|
"-of", tmp_wav.name + ".out",
|
|
stdout=asyncio.subprocess.PIPE,
|
|
stderr=asyncio.subprocess.PIPE,
|
|
)
|
|
_, wh_err = await wh.communicate()
|
|
if wh.returncode != 0:
|
|
log.error("whisper failed: %s", wh_err.decode("utf-8", "replace")[:300])
|
|
return None
|
|
|
|
txt_path = Path(tmp_wav.name + ".out.txt")
|
|
if not txt_path.exists():
|
|
log.error("whisper produced no output file")
|
|
return None
|
|
text = txt_path.read_text(encoding="utf-8").strip()
|
|
try:
|
|
txt_path.unlink()
|
|
except OSError:
|
|
pass
|
|
if not text:
|
|
return None
|
|
# filter non-speech markers — whisper.cpp emits "[BLANK_AUDIO]" etc.
|
|
if _NON_SPEECH.match(text):
|
|
log.info("transcript was non-speech marker: %r", text)
|
|
return None
|
|
return text
|
|
|
|
except Exception:
|
|
log.exception("transcribe failed")
|
|
return None
|
|
finally:
|
|
for p in (tmp_in.name, tmp_wav.name):
|
|
try:
|
|
os.unlink(p)
|
|
except OSError:
|
|
pass
|