feat: gpu-check probe + resident TTS daemon (serve)

Add two subcommands and the deps they need, shipped as madcat-say 0.1.0.

gpu-check (Sources/GpuCheck.swift)
  Queries the Metal device and runs a small MLX compute probe to verify
  the GPU pipeline + bundled mlx.metallib resolve before synth.

serve (Sources/Serve.swift)
  Loopback HTTP daemon (default 127.0.0.1:8765, Hummingbird). Loads
  VoxCPM2 once behind an actor (serializes the single GPU), warms the
  pipeline at boot, caches the last reference voice. Routes:
    GET  /health            -> {status,model,ready,uptime_s}
    POST /v1/audio/speech   {input|text, voice?, language?, timesteps?,
                             cfg?, prepad?} -> audio/wav
  Cuts warm synth to ~3.2s vs ~6.3s cold (in-process model load).

speak (Sources/MadcatSay.swift)
  Probes the daemon /health (0.6s) and forwards over HTTP when up;
  falls back to in-process load otherwise. New flags --daemon-port,
  --no-daemon.

Package.swift: add mlx-swift (GPU probe) and hummingbird 2.5..<2.17
  (HTTP only, no WebSocket — avoids the swift-websocket pin).

Makefile: resolve the metallib via `swift build --show-bin-path` on
  both packages instead of the triple-prefixed/symlink path, which is
  not always present in speech-swift.
This commit is contained in:
madcat-core
2026-06-10 21:04:19 +02:00
parent 67be877f7f
commit da04416ea4
5 changed files with 520 additions and 11 deletions
+92 -4
View File
@@ -3,16 +3,30 @@ import AudioCommon
import Foundation
@preconcurrency import VoxCPM2TTS
/// madcat-say speak text in a (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
/// madcat-say — on-device voice-cloning TTS using VoxCPM2 (MLX/Metal).
///
/// Root command. `speak` is the default subcommand, so the historical
/// `madcat-say "text"` form keeps working unchanged; `madcat-say gpu-check`
/// runs the Metal/GPU diagnostic.
@main
struct MadcatSay: AsyncParsableCommand {
    /// Root command metadata. Registers the `Speak`, `GpuCheck` and `Serve`
    /// subcommands and makes `Speak` the default subcommand.
    static let configuration: CommandConfiguration = .init(
        commandName: "madcat-say",
        abstract: "On-device voice cloning with VoxCPM2 (MLX/Metal).",
        subcommands: [Speak.self, GpuCheck.self, Serve.self],
        defaultSubcommand: Speak.self
    )
}
/// Speak text in a (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
///
/// madcat-say "Hello there"
/// madcat-say -r samantha.wav "I was calibrated just for you."
/// madcat-say -r samantha.wav -l polish "Cześć, jestem Samantha."
/// madcat-say -r samantha.wav -o out.wav "Saved instead of played."
@main
struct MadcatSay: AsyncParsableCommand {
struct Speak: AsyncParsableCommand {
static let configuration = CommandConfiguration(
commandName: "madcat-say",
commandName: "speak",
abstract: "Speak text in a cloned voice using VoxCPM2 (on-device, MLX/Metal).",
discussion: """
With no -r, uses the model's default voice. With -r it zero-shot clones
@@ -46,7 +60,19 @@ struct MadcatSay: AsyncParsableCommand {
@Option(name: .long, help: "Seconds of silence prepended to avoid a clipped start (default 0.1).")
var prepad: Double = 0.1
@Option(name: .long, help: "Resident daemon port to try first (default 8765).")
var daemonPort: Int = 8765
@Flag(name: .long, help: "Bypass the resident daemon; always load the model in-process.")
var noDaemon: Bool = false
func run() async throws {
// Fast path: forward to the resident `serve` daemon if it is up, so the
// model stays warm in GPU memory instead of being cold-loaded per call.
if !noDaemon, await trySpeakViaDaemon() {
return
}
let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
log("Loading \(modelId) ...")
let model = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
@@ -93,6 +119,68 @@ struct MadcatSay: AsyncParsableCommand {
// MARK: - Helpers
/// Try to satisfy this request via the resident `serve` daemon.
///
/// Probes `GET /health` with a short timeout, then POSTs the synthesis request
/// to `/v1/audio/speech` and either writes the returned WAV to `output` or
/// plays it through `/usr/bin/afplay`.
///
/// Never throws — a down or broken daemon must degrade gracefully, not abort
/// the command.
///
/// - Returns: `true` if the daemon handled the request (audio played or
///   saved); `false` on any failure, so the caller falls back to an in-process
///   model load.
private func trySpeakViaDaemon() async -> Bool {
    let base = "http://127.0.0.1:\(daemonPort)"
    let session = URLSession(configuration: .ephemeral)
    // Release the session once we are done with it instead of leaving it alive
    // for the rest of the process (which may go on to load the model in-process).
    defer { session.finishTasksAndInvalidate() }

    // Health probe with a short timeout so a missing daemon costs ~nothing.
    guard let healthURL = URL(string: base + "/health") else { return false }
    var healthReq = URLRequest(url: healthURL)
    healthReq.timeoutInterval = 0.6
    do {
        let (_, resp) = try await session.data(for: healthReq)
        guard let http = resp as? HTTPURLResponse, http.statusCode == 200 else { return false }
    } catch {
        return false
    }

    // Build the synthesis request.
    var payload: [String: Any] = ["input": text, "timesteps": timesteps, "cfg": Double(cfg), "prepad": prepad]
    if let language { payload["language"] = language }
    if let reference {
        // The daemon is a separate process whose working directory may differ
        // from ours, so a relative path such as `samantha.wav` could be resolved
        // against the wrong directory there. Send an absolute path, resolved
        // against this process's working directory.
        let expanded = (reference as NSString).expandingTildeInPath
        payload["voice"] = URL(fileURLWithPath: expanded).standardizedFileURL.path
    }
    guard let speakURL = URL(string: base + "/v1/audio/speech"),
          let bodyData = try? JSONSerialization.data(withJSONObject: payload) else {
        return false
    }
    var req = URLRequest(url: speakURL)
    req.httpMethod = "POST"
    req.setValue("application/json", forHTTPHeaderField: "Content-Type")
    req.httpBody = bodyData
    // Generous timeout for the synthesis call itself.
    req.timeoutInterval = 300

    log("Using resident daemon on port \(daemonPort) ...")

    // Phase 1: talk to the daemon. Failures here are genuinely daemon failures.
    let wav: Data
    do {
        let (body, resp) = try await session.data(for: req)
        guard let http = resp as? HTTPURLResponse, http.statusCode == 200, !body.isEmpty else {
            log("Daemon returned no audio; falling back to in-process load.")
            return false
        }
        wav = body
    } catch {
        log("Daemon request failed (\(error)); falling back to in-process load.")
        return false
    }

    // Phase 2: deliver the audio locally. Kept in its own do/catch so a failed
    // file write or playback is not misreported as a daemon failure.
    do {
        if let output {
            try wav.write(to: URL(fileURLWithPath: output))
            log("Saved \(wav.count) bytes to \(output)")
        } else {
            let tmp = FileManager.default.temporaryDirectory
                .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
            try wav.write(to: tmp)
            // Remove the temp file once playback has finished (or failed to start).
            defer { try? FileManager.default.removeItem(at: tmp) }
            let proc = Process()
            proc.executableURL = URL(fileURLWithPath: "/usr/bin/afplay")
            proc.arguments = [tmp.path]
            try proc.run()
            proc.waitUntilExit()
        }
        return true
    } catch {
        log("Could not save or play daemon audio (\(error)); falling back to in-process load.")
        return false
    }
}
private func playThroughSpeakers(samples: [Float], sampleRate: Int) throws {
let tmp = FileManager.default.temporaryDirectory
.appendingPathComponent("madcat-say-\(UUID().uuidString).wav")