chore: initial commit — chat-saiden web chat baseline
This commit is contained in:
+143
@@ -0,0 +1,143 @@
|
||||
"""Whisper.cpp STT adapter for chat.saiden.dev.
|
||||
|
||||
Transcribes microphone audio (webm/opus from browser) → text.
|
||||
Pipeline: ffmpeg → 16kHz mono WAV → whisper-cli → transcript text (read back from whisper-cli's --output-txt file, not stdout).
|
||||
|
||||
If whisper-cli, ffmpeg, or a model file is missing, STT is disabled: a warning is logged when STT is constructed and transcribe() returns None.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Whisper.cpp special tokens — emitted for non-speech audio.
# If the entire transcript consists only of these markers, treat it as no
# speech. One or more markers are accepted, separated by whitespace or
# newlines, so output such as "[BLANK_AUDIO]\n [BLANK_AUDIO]" (several
# marker-only lines in the text file) is filtered as well. Any real words
# before, between, or after the markers make the pattern fail to match.
_NON_SPEECH = re.compile(
    r"^\s*(?:[\[\(](?:BLANK_AUDIO|INAUDIBLE|NO_SPEECH|MUSIC|NOISE|SILENCE|SOUND|"
    r"APPLAUSE|LAUGHTER|CROSSTALK|BREATHING|UNINTELLIGIBLE)[\]\)]\s*)+$",
    re.IGNORECASE,
)
|
||||
|
||||
# Module logger, registered under a fixed name.
log = logging.getLogger("chat-saiden.stt")

# Executable discovery. A binary found on PATH wins; the environment variable
# is only consulted as a fallback when the PATH lookup finds nothing.
# NOTE(review): this is the opposite precedence to WHISPER_MODEL_PATH, where
# the environment variable overrides the built-in search — confirm intended.
WHISPER_BIN = shutil.which("whisper-cli")
if not WHISPER_BIN:
    WHISPER_BIN = os.environ.get("WHISPER_BIN")

FFMPEG_BIN = shutil.which("ffmpeg")
if not FFMPEG_BIN:
    FFMPEG_BIN = os.environ.get("FFMPEG_BIN")
|
||||
|
||||
# Candidate model files, probed in order; the first one that exists is used.
# Per-user cache entries come first, then the system-wide install locations.
_MODEL_SEARCH = [
    Path.home() / relative
    for relative in (
        ".cache/whisper/ggml-base.en.bin",
        ".cache/whisper/ggml-small.en.bin",
        ".cache/whisper/ggml-tiny.en.bin",
    )
] + [
    Path("/usr/local/share/whisper.cpp/ggml-base.en.bin"),
    Path("/usr/share/whisper.cpp/ggml-base.en.bin"),
]
|
||||
|
||||
|
||||
def _resolve_model() -> Path | None:
|
||||
override = os.environ.get("WHISPER_MODEL_PATH")
|
||||
if override:
|
||||
p = Path(override)
|
||||
return p if p.exists() else None
|
||||
for cand in _MODEL_SEARCH:
|
||||
if cand.exists():
|
||||
return cand
|
||||
return None
|
||||
|
||||
|
||||
class STT:
|
||||
"""Whisper-cpp wrapper."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.bin = WHISPER_BIN
|
||||
self.ffmpeg = FFMPEG_BIN
|
||||
self.model = _resolve_model()
|
||||
if not self.bin:
|
||||
log.warning("whisper-cli not found — STT disabled")
|
||||
elif not self.ffmpeg:
|
||||
log.warning("ffmpeg not found — STT disabled")
|
||||
elif not self.model:
|
||||
log.warning("no whisper model in known locations — STT disabled")
|
||||
else:
|
||||
log.info("STT enabled — model=%s bin=%s", self.model, self.bin)
|
||||
|
||||
@property
|
||||
def available(self) -> bool:
|
||||
return bool(self.bin and self.ffmpeg and self.model)
|
||||
|
||||
async def transcribe(self, audio_bytes: bytes, suffix: str = ".webm") -> str | None:
|
||||
"""Return transcript text, or None on failure / unavailable."""
|
||||
if not self.available:
|
||||
return None
|
||||
if not audio_bytes:
|
||||
return None
|
||||
|
||||
tmp_in = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
|
||||
tmp_in.write(audio_bytes)
|
||||
tmp_in.close()
|
||||
|
||||
tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
||||
tmp_wav.close()
|
||||
|
||||
try:
|
||||
# 1. ffmpeg: convert to 16kHz mono WAV (whisper's expected format)
|
||||
ff = await asyncio.create_subprocess_exec(
|
||||
self.ffmpeg, "-y", "-loglevel", "error",
|
||||
"-i", tmp_in.name,
|
||||
"-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
|
||||
tmp_wav.name,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
_, ff_err = await ff.communicate()
|
||||
if ff.returncode != 0:
|
||||
log.error("ffmpeg failed: %s", ff_err.decode("utf-8", "replace")[:300])
|
||||
return None
|
||||
|
||||
# 2. whisper-cli: transcribe → plain text on stdout
|
||||
wh = await asyncio.create_subprocess_exec(
|
||||
self.bin,
|
||||
"-m", str(self.model),
|
||||
"-f", tmp_wav.name,
|
||||
"--no-timestamps",
|
||||
"--no-prints",
|
||||
"--output-txt",
|
||||
"-of", tmp_wav.name + ".out",
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
_, wh_err = await wh.communicate()
|
||||
if wh.returncode != 0:
|
||||
log.error("whisper failed: %s", wh_err.decode("utf-8", "replace")[:300])
|
||||
return None
|
||||
|
||||
txt_path = Path(tmp_wav.name + ".out.txt")
|
||||
if not txt_path.exists():
|
||||
log.error("whisper produced no output file")
|
||||
return None
|
||||
text = txt_path.read_text(encoding="utf-8").strip()
|
||||
try:
|
||||
txt_path.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
if not text:
|
||||
return None
|
||||
# filter non-speech markers — whisper.cpp emits "[BLANK_AUDIO]" etc.
|
||||
if _NON_SPEECH.match(text):
|
||||
log.info("transcript was non-speech marker: %r", text)
|
||||
return None
|
||||
return text
|
||||
|
||||
except Exception:
|
||||
log.exception("transcribe failed")
|
||||
return None
|
||||
finally:
|
||||
for p in (tmp_in.name, tmp_wav.name):
|
||||
try:
|
||||
os.unlink(p)
|
||||
except OSError:
|
||||
pass
|
||||
Reference in New Issue
Block a user