feat: gpu-check probe + resident TTS daemon (serve)

Add two subcommands and the deps they need, shipped as madcat-say 0.1.0. gpu-check (Sources/GpuCheck.swift): queries the Metal device and runs a small MLX compute probe to verify that the GPU pipeline and the bundled mlx.metallib resolve before synth. serve (Sources/Serve.swift): loopback HTTP daemon (default 127.0.0.1:8765, Hummingbird). Loads VoxCPM2 once behind an actor (serializes the single GPU), warms the pipeline at boot, and caches the last reference voice. Routes: GET /health -> {status,model,ready,uptime_s}; POST /v1/audio/speech {input|text, voice?, language?, timesteps?, cfg?, prepad?} -> audio/wav. Cuts warm synth to ~3.2s vs ~6.3s cold (in-process model load). speak (Sources/MadcatSay.swift): probes the daemon /health (0.6s timeout) and forwards over HTTP when up; falls back to in-process load otherwise. New flags: --daemon-port, --no-daemon. Package.swift: add mlx-swift (GPU probe) and hummingbird 2.5..<2.17 (HTTP only, no WebSocket — avoids the swift-websocket pin). Makefile: resolve the metallib via `swift build --show-bin-path` on both packages instead of the triple-prefixed/symlink path, which is not always present in speech-swift.
2026-06-10 21:04:19 +02:00
parent 67be877f7f
commit da04416ea4
5 changed files with 520 additions and 11 deletions
@@ -3,16 +3,30 @@ import AudioCommon
 import Foundation
@preconcurrency import VoxCPM2TTS

-/// madcat-say — speak text in a (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
+/// madcat-say — on-device voice-cloning TTS using VoxCPM2 (MLX/Metal).
+///
+/// Root command. `speak` is the default subcommand, so the historical
+/// `madcat-say "text"` form keeps working unchanged; `madcat-say gpu-check`
+/// runs the Metal/GPU diagnostic and `madcat-say serve` starts the resident daemon.
+@main
+struct MadcatSay: AsyncParsableCommand {
+    static let configuration = CommandConfiguration(
+        commandName: "madcat-say",
+        abstract: "On-device voice cloning with VoxCPM2 (MLX/Metal).",
+        subcommands: [Speak.self, GpuCheck.self, Serve.self],
+        defaultSubcommand: Speak.self  // bare `madcat-say "text"` dispatches to `speak`
+    )
+}
+
+/// Speak text in a (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
 ///
 ///   madcat-say "Hello there"
 ///   madcat-say -r samantha.wav "I was calibrated just for you."
 ///   madcat-say -r samantha.wav -l polish "Cześć, jestem Samantha."
 ///   madcat-say -r samantha.wav -o out.wav "Saved instead of played."
-@main
-struct MadcatSay: AsyncParsableCommand {
+struct Speak: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
-        commandName: "madcat-say",
+        commandName: "speak",
        abstract: "Speak text in a cloned voice using VoxCPM2 (on-device, MLX/Metal).",
        discussion: """
        With no -r, uses the model's default voice. With -r it zero-shot clones
@@ -46,7 +60,19 @@ struct MadcatSay: AsyncParsableCommand {
    @Option(name: .long, help: "Seconds of silence prepended to avoid a clipped start (default 0.1).")
    var prepad: Double = 0.1

+    @Option(name: .long, help: "Resident daemon port to try first (default 8765).")
+    var daemonPort: Int = 8765
+
+    @Flag(name: .long, help: "Bypass the resident daemon; always load the model in-process.")
+    var noDaemon: Bool = false
+
    func run() async throws {
+        // Fast path: forward to the resident `serve` daemon if it is up, so the
+        // model stays warm in GPU memory instead of being cold-loaded per call.
+        if !noDaemon, await trySpeakViaDaemon() {
+            return
+        }
+
        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
        log("Loading \(modelId) ...")
        let model = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
@@ -93,6 +119,68 @@ struct MadcatSay: AsyncParsableCommand {

    // MARK: - Helpers

+    /// Try to satisfy this request via the resident `serve` daemon.
+    /// - Returns: `true` if the daemon handled it (audio played or saved); `false` on
+    ///   any failure so the caller falls back to an in-process model load. Never
+    ///   throws — a down or broken daemon must degrade gracefully, not abort the command.
+    private func trySpeakViaDaemon() async -> Bool {
+        let base = "http://127.0.0.1:\(daemonPort)"
+        let session = URLSession(configuration: .ephemeral)
+
+        // Health probe with a short timeout so a missing daemon costs ~nothing.
+        guard let healthURL = URL(string: base + "/health") else { return false }
+        let healthReq = URLRequest(url: healthURL, timeoutInterval: 0.6)
+        guard let (_, healthResp) = try? await session.data(for: healthReq),
+              (healthResp as? HTTPURLResponse)?.statusCode == 200 else {
+            return false
+        }
+
+        // Build the synthesis request. The daemon is a separate process, so a relative
+        // -r path is made absolute here instead of resolving against the daemon's cwd.
+        var payload: [String: Any] = ["input": text, "timesteps": timesteps, "cfg": Double(cfg), "prepad": prepad]
+        if let language { payload["language"] = language }
+        if let reference {
+            let expanded = (reference as NSString).expandingTildeInPath
+            payload["voice"] = URL(fileURLWithPath: expanded).standardizedFileURL.path
+        }
+
+        guard let speakURL = URL(string: base + "/v1/audio/speech"),
+              JSONSerialization.isValidJSONObject(payload),  // NaN/inf would raise, not throw
+              let bodyData = try? JSONSerialization.data(withJSONObject: payload) else { return false }
+        var req = URLRequest(url: speakURL)
+        req.httpMethod = "POST"
+        req.setValue("application/json", forHTTPHeaderField: "Content-Type")
+        req.httpBody = bodyData
+        req.timeoutInterval = 300
+
+        log("Using resident daemon on port \(daemonPort) ...")
+        do {
+            let (wav, resp) = try await session.data(for: req)
+            guard let http = resp as? HTTPURLResponse, http.statusCode == 200, !wav.isEmpty else {
+                log("Daemon returned no audio; falling back to in-process load.")
+                return false
+            }
+            if let output {
+                try wav.write(to: URL(fileURLWithPath: output))
+                log("Saved \(wav.count) bytes to \(output)")
+            } else {
+                let tmp = FileManager.default.temporaryDirectory
+                    .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
+                try wav.write(to: tmp)
+                defer { try? FileManager.default.removeItem(at: tmp) }
+                let proc = Process()
+                proc.executableURL = URL(fileURLWithPath: "/usr/bin/afplay")
+                proc.arguments = [tmp.path]
+                try proc.run()
+                proc.waitUntilExit()
+            }
+            return true
+        } catch {
+            log("Daemon request failed (\(error)); falling back to in-process load.")
+            return false
+        }
+    }
+
    private func playThroughSpeakers(samples: [Float], sampleRate: Int) throws {
        let tmp = FileManager.default.temporaryDirectory
            .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")