feat: gpu-check probe + resident TTS daemon (serve)

Add two subcommands and the deps they need, shipped as madcat-say 0.1.0. gpu-check (Sources/GpuCheck.swift) Queries the Metal device and runs a small MLX compute probe to verify the GPU pipeline + bundled mlx.metallib resolve before synth. serve (Sources/Serve.swift) Loopback HTTP daemon (default 127.0.0.1:8765, Hummingbird). Loads VoxCPM2 once behind an actor (serializes the single GPU), warms the pipeline at boot, caches the last reference voice. Routes: GET /health -> {status,model,ready,uptime_s} POST /v1/audio/speech {input|text, voice?, language?, timesteps?, cfg?, prepad?} -> audio/wav Cuts warm synth to ~3.2s vs ~6.3s cold (in-process model load). speak (Sources/MadcatSay.swift) Probes the daemon /health (0.6s) and forwards over HTTP when up; falls back to in-process load otherwise. New flags --daemon-port, --no-daemon. Package.swift: add mlx-swift (GPU probe) and hummingbird 2.5..<2.17 (HTTP only, no WebSocket — avoids the swift-websocket pin). Makefile: resolve the metallib via `swift build --show-bin-path` on both packages instead of the triple-prefixed/symlink path, which is not always present in speech-swift.
2026-06-10 21:04:19 +02:00
parent 67be877f7f
commit da04416ea4
5 changed files with 520 additions and 11 deletions
@@ -0,0 +1,177 @@
+import ArgumentParser
+import Dispatch
+import Foundation
+import MLX
+
+/// `madcat-say gpu-check` — diagnostic that reports whether MLX sees an Apple
+/// Silicon Metal GPU and whether the compiled Metal shader library
+/// (`mlx.metallib`) sits next to the binary.
+///
+/// Steps performed:
+///   1. Read `GPU.deviceInfo()` and test its architecture string for "apple".
+///   2. Look for `mlx.metallib` beside the executable. Missing => shaders are
+///      JIT-compiled (~5x slower), or inference aborts with
+///      "Failed to load the default metallib".
+///   3. Run two NxN matmuls, each forced to completion with `eval()`, timing
+///      the cold and the warm pass and recording the change in
+///      `Memory.activeMemory`.
+///
+/// Exit code: 0 = PASS, 2 = FAIL. The verdict is decided by the architecture
+/// check alone; the metallib and probe results are reported but do not gate it.
+struct GpuCheck: AsyncParsableCommand {
+    static let configuration = CommandConfiguration(
+        commandName: "gpu-check",
+        abstract: "Confirm MLX is using the Apple Silicon Metal GPU (not CPU)."
+    )
+
+    @Flag(name: .long, help: "Emit machine-readable JSON instead of text.")
+    var json = false
+
+    @Option(name: .long, help: "Edge length N of the NxN matmul probe (default 1024).")
+    var probe: Int = 1024
+
+    /// Gathers device facts, runs the compute probe, prints the report (text or
+    /// JSON depending on `--json`) and throws `ExitCode(2)` on a FAIL verdict.
+    func run() async throws {
+        // Device identity: the architecture string decides the verdict.
+        let device = GPU.deviceInfo()
+        let archName = device.architecture
+        let looksLikeApple = archName.lowercased().contains("apple")
+        let deviceMemory = device.memorySize
+        let recommendedWorkingSet = Int(device.maxRecommendedWorkingSetSize)
+
+        // Shader library location and presence.
+        let metallib = Self.metallibStatus()
+
+        // Compute probe. The edge length is floored at 8; the first eval is
+        // timed as "cold", the second as "warm".
+        let edge = max(8, probe)
+        let baselineActive = Memory.activeMemory
+
+        let lhs = ones([edge, edge])
+        let rhs = ones([edge, edge])
+
+        let coldMark = DispatchTime.now()
+        var product = matmul(lhs, rhs)
+        eval(product)
+        let coldElapsed = Self.elapsedMs(since: coldMark)
+
+        let warmMark = DispatchTime.now()
+        product = matmul(product, rhs)
+        eval(product)
+        let warmElapsed = Self.elapsedMs(since: warmMark)
+
+        let total = product.sum().item(Float.self)
+        let activeGrowth = Memory.activeMemory - baselineActive
+        let verdict = looksLikeApple
+
+        let report = Report(
+            appleGPU: looksLikeApple,
+            architecture: archName,
+            totalMemory: deviceMemory,
+            maxWorkingSet: recommendedWorkingSet,
+            metallibPath: metallib.path,
+            metallibFound: metallib.found,
+            probe: edge,
+            coldMs: coldElapsed,
+            warmMs: warmElapsed,
+            checksum: total,
+            memoryDelta: activeGrowth,
+            pass: verdict
+        )
+
+        if json {
+            print(report.json)
+        } else {
+            print(report.text)
+        }
+
+        guard verdict else {
+            throw ExitCode(2)
+        }
+    }
+
+    // MARK: - Helpers
+
+    /// Returns the expected `mlx.metallib` path (same directory as the
+    /// symlink-resolved executable) and whether a file exists there.
+    private static func metallibStatus() -> (path: String, found: Bool) {
+        let executable: URL
+        if let bundled = Bundle.main.executableURL {
+            executable = bundled
+        } else {
+            executable = URL(fileURLWithPath: CommandLine.arguments.first ?? "madcat-say")
+        }
+        let libraryPath = executable
+            .resolvingSymlinksInPath()
+            .deletingLastPathComponent()
+            .appendingPathComponent("mlx.metallib")
+            .path
+        let exists = FileManager.default.fileExists(atPath: libraryPath)
+        return (libraryPath, exists)
+    }
+
+    /// Milliseconds elapsed between `start` and now, from uptime nanoseconds.
+    private static func elapsedMs(since start: DispatchTime) -> Double {
+        let nowNanos = DispatchTime.now().uptimeNanoseconds
+        let deltaNanos = nowNanos &- start.uptimeNanoseconds
+        return Double(deltaNanos) / 1_000_000
+    }
+
+    /// Formats a byte count with one decimal and a 1024-based unit (B ... TB).
+    /// Zero and negative counts render as "0 B".
+    static func human(_ bytes: Int) -> String {
+        if bytes <= 0 { return "0 B" }
+        let units = ["B", "KB", "MB", "GB", "TB"]
+        let lastUnit = units.count - 1
+        var scaled = Double(bytes)
+        var unitIndex = 0
+        while unitIndex < lastUnit, scaled >= 1024 {
+            scaled /= 1024
+            unitIndex += 1
+        }
+        return String(format: "%.1f %@", scaled, units[unitIndex])
+    }
+}
+
+/// Immutable snapshot of one `gpu-check` run, renderable as a human-readable
+/// text block (`text`) or as pretty-printed JSON (`json`).
+private struct Report {
+    let appleGPU: Bool
+    let architecture: String
+    let totalMemory: Int
+    let maxWorkingSet: Int
+    let metallibPath: String
+    let metallibFound: Bool
+    let probe: Int
+    let coldMs: Double
+    let warmMs: Double
+    let checksum: Float
+    let memoryDelta: Int
+    let pass: Bool
+
+    /// Multi-line plain-text rendering. Two extra warning lines are inserted
+    /// after the metallib line when the library is missing.
+    var text: String {
+        let appleLabel = appleGPU ? "YES" : "NO"
+        let metallibLabel = metallibFound ? "found" : "MISSING"
+        let verdictLine = pass
+            ? "PASS - MLX inference will run on the Metal GPU."
+            : "FAIL - no Apple Metal GPU detected; inference would fall back to CPU."
+
+        var rows: [String] = [
+            "madcat-say gpu-check",
+            "--------------------",
+            "Metal GPU       : \(architecture)  (Apple Silicon: \(appleLabel))",
+            "Total memory    : \(GpuCheck.human(totalMemory))",
+            "Max working set : \(GpuCheck.human(maxWorkingSet))",
+            "MLX metallib    : \(metallibLabel)  (\(metallibPath))"
+        ]
+        if !metallibFound {
+            rows += [
+                "                  WARNING: shaders will JIT-compile (~5x slower) or fail to load.",
+                "                  Fix: run `make` (copies speech-swift's mlx.metallib next to the binary)."
+            ]
+        }
+        rows += [
+            "Compute probe   : \(probe)x\(probe) matmul on GPU",
+            String(format: "  cold eval     : %.2f ms   (Metal pipeline / metallib load)", coldMs),
+            String(format: "  warm eval     : %.2f ms", warmMs),
+            // A negative delta is displayed as zero.
+            "  GPU mem delta : \(GpuCheck.human(max(0, memoryDelta))) active",
+            String(format: "  checksum      : %.0f", checksum),
+            "VERDICT: " + verdictLine
+        ]
+        return rows.joined(separator: "\n")
+    }
+
+    /// Pretty-printed JSON rendering with sorted keys. If serialization throws,
+    /// a minimal object carrying only the verdict is returned instead.
+    var json: String {
+        let verdict = pass ? "PASS" : "FAIL"
+        let fields: [String: Any] = [
+            "apple_gpu": appleGPU,
+            "architecture": architecture,
+            "total_memory_bytes": totalMemory,
+            "max_recommended_working_set_bytes": maxWorkingSet,
+            "metallib_path": metallibPath,
+            "metallib_found": metallibFound,
+            "probe_dim": probe,
+            "cold_eval_ms": coldMs,
+            "warm_eval_ms": warmMs,
+            "active_memory_delta_bytes": memoryDelta,
+            "checksum": Double(checksum),
+            "verdict": verdict
+        ]
+        do {
+            let encoded = try JSONSerialization.data(
+                withJSONObject: fields, options: [.prettyPrinted, .sortedKeys]
+            )
+            return String(decoding: encoded, as: UTF8.self)
+        } catch {
+            return "{\"verdict\":\"\(verdict)\"}"
+        }
+    }
+}
@@ -3,16 +3,30 @@ import AudioCommon
 import Foundation
@preconcurrency import VoxCPM2TTS

-/// madcat-say — speak text in a (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
+/// madcat-say — on-device voice-cloning TTS using VoxCPM2 (MLX/Metal).
+///
+/// Root command. `speak` is the default subcommand, so the historical
+/// `madcat-say "text"` form keeps working unchanged; `madcat-say gpu-check`
+/// runs the Metal/GPU diagnostic and `madcat-say serve` starts the resident
+/// TTS daemon.
+@main
+struct MadcatSay: AsyncParsableCommand {
+    static let configuration = CommandConfiguration(
+        commandName: "madcat-say",
+        abstract: "On-device voice cloning with VoxCPM2 (MLX/Metal).",
+        subcommands: [Speak.self, GpuCheck.self, Serve.self],
+        defaultSubcommand: Speak.self
+    )
+}
+
+/// Speak text in a (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
 ///
 ///   madcat-say "Hello there"
 ///   madcat-say -r samantha.wav "I was calibrated just for you."
 ///   madcat-say -r samantha.wav -l polish "Cześć, jestem Samantha."
 ///   madcat-say -r samantha.wav -o out.wav "Saved instead of played."
-@main
-struct MadcatSay: AsyncParsableCommand {
+struct Speak: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
-        commandName: "madcat-say",
+        commandName: "speak",
        abstract: "Speak text in a cloned voice using VoxCPM2 (on-device, MLX/Metal).",
        discussion: """
        With no -r, uses the model's default voice. With -r it zero-shot clones
@@ -46,7 +60,19 @@ struct MadcatSay: AsyncParsableCommand {
    @Option(name: .long, help: "Seconds of silence prepended to avoid a clipped start (default 0.1).")
    var prepad: Double = 0.1

+    @Option(name: .long, help: "Resident daemon port to try first (default 8765).")
+    var daemonPort: Int = 8765
+
+    @Flag(name: .long, help: "Bypass the resident daemon; always load the model in-process.")
+    var noDaemon: Bool = false
+
    func run() async throws {
+        // Fast path: forward to the resident `serve` daemon if it is up, so the
+        // model stays warm in GPU memory instead of being cold-loaded per call.
+        if !noDaemon, await trySpeakViaDaemon() {
+            return
+        }
+
        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
        log("Loading \(modelId) ...")
        let model = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
@@ -93,6 +119,68 @@ struct MadcatSay: AsyncParsableCommand {

    // MARK: - Helpers

+    /// Try to satisfy this request via the resident `serve` daemon.
+    ///
+    /// Flow: probe `/health` with a 0.6 s timeout, check that the daemon serves
+    /// the model variant this invocation asked for, POST the synthesis request,
+    /// then either save the returned WAV to `output` or play it with `afplay`.
+    ///
+    /// - Returns: `true` if the daemon handled the request (audio played or
+    ///   saved); `false` on any failure or mismatch so the caller falls back to
+    ///   an in-process model load. Never throws — a down or broken daemon must
+    ///   degrade gracefully, not abort the command.
+    private func trySpeakViaDaemon() async -> Bool {
+        let base = "http://127.0.0.1:\(daemonPort)"
+        let session = URLSession(configuration: .ephemeral)
+        // Release the session's resources on every exit path.
+        defer { session.finishTasksAndInvalidate() }
+
+        // Health probe with a short timeout so a missing daemon costs ~nothing.
+        guard let healthURL = URL(string: base + "/health") else { return false }
+        var healthReq = URLRequest(url: healthURL)
+        healthReq.timeoutInterval = 0.6
+        do {
+            let (healthData, resp) = try await session.data(for: healthReq)
+            guard let http = resp as? HTTPURLResponse, http.statusCode == 200 else { return false }
+
+            // The daemon keeps exactly one model resident. If it reports a
+            // different model than the requested variant, forwarding would
+            // silently synthesize with the wrong weights, so fall back instead.
+            // A health body without a "model" field is accepted as-is.
+            let wantedModel = "aufklarer/VoxCPM2-MLX-\(variant)"
+            if let health = try? JSONSerialization.jsonObject(with: healthData) as? [String: Any],
+               let daemonModel = health["model"] as? String,
+               daemonModel != wantedModel {
+                log("Daemon serves \(daemonModel), not \(wantedModel); falling back to in-process load.")
+                return false
+            }
+        } catch {
+            return false
+        }
+
+        // Build the synthesis request.
+        var payload: [String: Any] = ["input": text, "timesteps": timesteps, "cfg": Double(cfg), "prepad": prepad]
+        if let language { payload["language"] = language }
+        if let reference {
+            // The daemon opens this path itself and runs with its own working
+            // directory, so resolve relative paths against *our* cwd first.
+            let expanded = (reference as NSString).expandingTildeInPath
+            payload["voice"] = URL(fileURLWithPath: expanded).standardizedFileURL.path
+        }
+
+        guard let speakURL = URL(string: base + "/v1/audio/speech"),
+              let bodyData = try? JSONSerialization.data(withJSONObject: payload) else {
+            return false
+        }
+        var req = URLRequest(url: speakURL)
+        req.httpMethod = "POST"
+        req.setValue("application/json", forHTTPHeaderField: "Content-Type")
+        req.httpBody = bodyData
+        // Synthesis of long inputs can take a while; allow up to 5 minutes.
+        req.timeoutInterval = 300
+
+        log("Using resident daemon on port \(daemonPort) ...")
+        do {
+            let (wav, resp) = try await session.data(for: req)
+            guard let http = resp as? HTTPURLResponse, http.statusCode == 200, !wav.isEmpty else {
+                log("Daemon returned no audio; falling back to in-process load.")
+                return false
+            }
+            if let output {
+                try wav.write(to: URL(fileURLWithPath: output))
+                log("Saved \(wav.count) bytes to \(output)")
+            } else {
+                let tmp = FileManager.default.temporaryDirectory
+                    .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
+                // Register cleanup before writing so a partial file is removed too.
+                defer { try? FileManager.default.removeItem(at: tmp) }
+                try wav.write(to: tmp)
+                let proc = Process()
+                proc.executableURL = URL(fileURLWithPath: "/usr/bin/afplay")
+                proc.arguments = [tmp.path]
+                try proc.run()
+                proc.waitUntilExit()
+                if proc.terminationStatus != 0 {
+                    // Audio was produced; only playback failed. Surface it, but do
+                    // not trigger a second (cold) synthesis for the same text.
+                    log("afplay exited with status \(proc.terminationStatus).")
+                }
+            }
+            return true
+        } catch {
+            log("Daemon request failed (\(error)); falling back to in-process load.")
+            return false
+        }
+    }
+
    private func playThroughSpeakers(samples: [Float], sampleRate: Int) throws {
        let tmp = FileManager.default.temporaryDirectory
            .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
@@ -0,0 +1,231 @@
+import ArgumentParser
+import AudioCommon
+import Foundation
+import Hummingbird
+import NIOCore
+@preconcurrency import VoxCPM2TTS
+
+/// `madcat-say serve` — resident VoxCPM2 TTS daemon.
+///
+/// Loads the model once, warms the Metal pipeline, and keeps it resident in GPU
+/// memory, serving synthesis over a loopback HTTP API. This pays the multi-GB
+/// cold model load exactly once (at startup) instead of on every `madcat-say`
+/// invocation. The `speak` subcommand probes this daemon's `/health` and
+/// forwards to it when up, falling back to an in-process load when it is not.
+///
+/// Routes:
+///   - `GET /health` -> JSON object with `status`, `model`, `ready`, `uptime_s`
+///     (HTTP 200 when ready, 503 otherwise).
+///   - `POST /v1/audio/speech` with a JSON body
+///     `{input|text, voice?, language?, timesteps?, cfg?, prepad?}` -> `audio/wav`
+///     (HTTP 400 when the text is missing, 500 when synthesis throws).
+struct Serve: AsyncParsableCommand {
+    static let configuration = CommandConfiguration(
+        commandName: "serve",
+        abstract: "Run the resident VoxCPM2 TTS daemon (load once, stay warm)."
+    )
+
+    @Option(name: .long, help: "Host to bind (default: 127.0.0.1).")
+    var host: String = "127.0.0.1"
+
+    @Option(name: .long, help: "Port to bind (default: 8765).")
+    var port: Int = 8765
+
+    @Option(name: .long, help: "Model variant: int4 (default), int8, bf16.")
+    var variant: String = "int4"
+
+    @Option(name: .long, help: "Default diffusion timesteps per patch (default 10).")
+    var timesteps: Int = 10
+
+    @Option(name: .long, help: "Default classifier-free guidance scale (default 2.0).")
+    var cfg: Float = 2.0
+
+    @Flag(name: .long, help: "Skip the warmup synthesis at startup.")
+    var noWarmup: Bool = false
+
+    /// Loads (and optionally warms) the model, registers the two routes and
+    /// runs the HTTP server until it is stopped.
+    func run() async throws {
+        let engine = SynthEngine(variant: variant, defaultTimesteps: timesteps, defaultCfg: cfg)
+        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
+
+        log("Loading \(modelId) ...")
+        try await engine.load()
+        if !noWarmup {
+            log("Warming up GPU pipeline ...")
+            try await engine.warmup()
+        }
+        log("Model resident. Serving on http://\(host):\(port)")
+        log("  GET  /health")
+        log("  POST /v1/audio/speech   {input|text, voice?, language?, timesteps?, cfg?, prepad?}")
+
+        let started = Date()
+        let router = Router()
+
+        router.get("/health") { _, _ -> Response in
+            let ready = await engine.isReady
+            let uptime = Int(Date().timeIntervalSince(started))
+            let body = Self.healthBody(modelId: modelId, ready: ready, uptimeSeconds: uptime)
+            return Response(
+                status: ready ? .ok : .serviceUnavailable,
+                headers: [.contentType: "application/json"],
+                body: .init(byteBuffer: .init(data: body)))
+        }
+
+        router.post("/v1/audio/speech") { request, _ -> Response in
+            // Request bodies are capped at 1 MiB.
+            let body = try await request.body.collect(upTo: 1 << 20)
+            let req = SpeechRequest.parse(body)
+            guard let text = req.text, !text.isEmpty else {
+                return Self.errorResponse("Missing 'input' (or 'text') field.", status: .badRequest)
+            }
+            do {
+                let result = try await engine.synthesize(
+                    text: text,
+                    language: req.language,
+                    referencePath: req.voice,
+                    timesteps: req.timesteps,
+                    cfg: req.cfg,
+                    prepad: req.prepad ?? 0.1)
+                let wav = try Self.encodeWAV(samples: result.samples, sampleRate: result.sampleRate)
+                return Response(
+                    status: .ok,
+                    headers: [.contentType: "audio/wav"],
+                    body: .init(byteBuffer: .init(data: wav)))
+            } catch {
+                return Self.errorResponse("Synthesis failed: \(error)", status: .internalServerError)
+            }
+        }
+
+        let app = Application(
+            router: router,
+            configuration: .init(address: .hostname(host, port: port)))
+        try await app.run()
+    }
+
+    // MARK: - Helpers
+
+    /// Serializes the `/health` payload with a real JSON encoder so that a
+    /// model id containing quotes or backslashes (it is derived from the
+    /// `--variant` option) still yields valid JSON.
+    static func healthBody(modelId: String, ready: Bool, uptimeSeconds: Int) -> Data {
+        let payload: [String: Any] = [
+            "status": ready ? "ok" : "loading",
+            "model": modelId,
+            "ready": ready,
+            "uptime_s": uptimeSeconds
+        ]
+        let encoded = try? JSONSerialization.data(
+            withJSONObject: payload, options: [.sortedKeys, .withoutEscapingSlashes])
+        return encoded ?? Data("{}".utf8)
+    }
+
+    /// Encodes samples as WAV bytes by round-tripping through a temporary file
+    /// written by `WAVWriter`.
+    ///
+    /// - Throws: whatever `WAVWriter.write` or `Data(contentsOf:)` throws.
+    static func encodeWAV(samples: [Float], sampleRate: Int) throws -> Data {
+        let tmp = FileManager.default.temporaryDirectory
+            .appendingPathComponent("madcat-say-serve-\(UUID().uuidString).wav")
+        // Registered before the write so a partially written file is removed
+        // even when the write itself throws.
+        defer { try? FileManager.default.removeItem(at: tmp) }
+        try WAVWriter.write(samples: samples, sampleRate: sampleRate, to: tmp)
+        return try Data(contentsOf: tmp)
+    }
+
+    /// Builds a JSON `{"error": message}` response with the given status.
+    static func errorResponse(_ message: String, status: HTTPResponse.Status) -> Response {
+        let data = (try? JSONSerialization.data(withJSONObject: ["error": message])) ?? Data()
+        return Response(
+            status: status,
+            headers: [.contentType: "application/json"],
+            body: .init(byteBuffer: .init(data: data)))
+    }
+
+    /// Writes one line to standard error.
+    func log(_ message: String) {
+        FileHandle.standardError.write(Data((message + "\n").utf8))
+    }
+}
+
+// MARK: - Request model
+
+/// Decoded body of a `/v1/audio/speech` call. OpenAI-ish: `input` carries the
+/// text and `voice` is a reference WAV path (loopback / same host) used for
+/// zero-shot cloning. Every field is optional; absent or wrongly typed keys
+/// stay `nil`.
+struct SpeechRequest {
+    var text: String?
+    var voice: String?
+    var language: String?
+    var timesteps: Int?
+    var cfg: Float?
+    var prepad: Double?
+
+    /// Decodes a request from raw JSON bytes.
+    ///
+    /// - Parameter body: the collected HTTP request body.
+    /// - Returns: the parsed request; an all-`nil` request when the body is not
+    ///   a JSON object.
+    static func parse(_ body: ByteBuffer) -> SpeechRequest {
+        let raw = Data(buffer: body)
+        guard let decoded = try? JSONSerialization.jsonObject(with: raw),
+              let fields = decoded as? [String: Any] else {
+            return SpeechRequest()
+        }
+
+        // `input` wins over `text` when both are present.
+        let spoken = (fields["input"] as? String) ?? (fields["text"] as? String)
+        let guidance = (fields["cfg"] as? Double).map { Float($0) }
+
+        return SpeechRequest(
+            text: spoken,
+            voice: fields["voice"] as? String,
+            language: fields["language"] as? String,
+            timesteps: fields["timesteps"] as? Int,
+            cfg: guidance,
+            prepad: fields["prepad"] as? Double)
+    }
+}
+
+// MARK: - Resident synthesis engine
+
+/// Holds the loaded model and serializes single-GPU access. Caches the most
+/// recently used reference voice so repeated clone calls skip the re-decode.
+///
+/// Actor isolation alone does not serialize synthesis: an actor is re-entrant
+/// at every `await`, so a second request could start generating while the first
+/// is suspended inside the model call. An explicit FIFO gate
+/// (`acquireGPU` / `releaseGPU`) therefore wraps every model invocation.
+actor SynthEngine {
+    private var model: VoxCPM2TTSModel?
+    private let modelId: String
+    private let defaultTimesteps: Int
+    private let defaultCfg: Float
+
+    // Single-entry reference-voice cache, keyed by path AND modification date
+    // so an edited file is re-decoded instead of serving the stale voice.
+    private var cachedReferencePath: String?
+    private var cachedReferenceModified: Date?
+    private var cachedReference: [Float]?
+
+    // FIFO gate guarding the model: `gpuBusy` is true while a holder exists;
+    // `gpuWaiters` are resumed one at a time, in arrival order.
+    private var gpuBusy = false
+    private var gpuWaiters: [CheckedContinuation<Void, Never>] = []
+
+    /// Upper bound on the leading silence a request may ask for. Keeps a
+    /// client-supplied `prepad` from overflowing `Int` or exhausting memory.
+    private static let maxPrepadSeconds: Double = 30
+
+    init(variant: String, defaultTimesteps: Int, defaultCfg: Float) {
+        self.modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
+        self.defaultTimesteps = defaultTimesteps
+        self.defaultCfg = defaultCfg
+    }
+
+    /// True once the model has been loaded.
+    var isReady: Bool { model != nil }
+
+    /// Loads the model if it is not loaded yet, reporting progress on stderr.
+    ///
+    /// - Throws: whatever `VoxCPM2TTSModel.fromPretrained` throws.
+    func load() async throws {
+        if model != nil { return }
+        let m = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
+            FileHandle.standardError.write(Data("  [\(Int(progress * 100))%] \(status)\n".utf8))
+        }
+        model = m
+    }
+
+    /// Runs one throwaway synthesis with the default settings. Does nothing
+    /// when the model is not loaded.
+    func warmup() async throws {
+        guard let m = model else { return }
+        await acquireGPU()
+        defer { releaseGPU() }
+        _ = try await m.generateVoxCPM2(
+            text: "Warming up.",
+            language: nil,
+            refAudio: nil,
+            inferenceTimesteps: defaultTimesteps,
+            cfgValue: defaultCfg)
+    }
+
+    struct Result { let samples: [Float]; let sampleRate: Int }
+
+    /// Synthesizes `text`, optionally cloning the voice in `referencePath`.
+    ///
+    /// - Parameters:
+    ///   - text: text to speak.
+    ///   - language: optional language hint forwarded to the model.
+    ///   - referencePath: optional path to a reference audio file (`~` allowed).
+    ///   - timesteps: diffusion timesteps; the engine default when `nil`.
+    ///   - cfg: guidance scale; the engine default when `nil`.
+    ///   - prepad: seconds of silence to prepend. Non-finite values count as 0;
+    ///     the value is clamped to `0...maxPrepadSeconds`.
+    /// - Returns: samples (with leading silence) and the model's sample rate.
+    /// - Throws: `ValidationError` when the model cannot be loaded, the
+    ///   reference file is missing, or no audio was produced; rethrows loader
+    ///   and model errors.
+    func synthesize(
+        text: String,
+        language: String?,
+        referencePath: String?,
+        timesteps: Int?,
+        cfg: Float?,
+        prepad: Double
+    ) async throws -> Result {
+        // One synthesis at a time; later callers queue here in FIFO order.
+        await acquireGPU()
+        defer { releaseGPU() }
+
+        if model == nil { try await load() }
+        guard let m = model else {
+            throw ValidationError("Model failed to load.")
+        }
+
+        let refAudio = try loadReference(referencePath)
+
+        let audio = try await m.generateVoxCPM2(
+            text: text,
+            language: language,
+            refAudio: refAudio,
+            inferenceTimesteps: timesteps ?? defaultTimesteps,
+            cfgValue: cfg ?? defaultCfg)
+        guard !audio.isEmpty else {
+            throw ValidationError("No audio was generated.")
+        }
+
+        let rate = m.sampleRate
+        // `Int(_:)` traps on out-of-range doubles (e.g. a JSON `1e300`), so
+        // sanitize before converting.
+        let padSeconds = prepad.isFinite ? min(max(0, prepad), Self.maxPrepadSeconds) : 0
+        let padSamples = Int(padSeconds * Double(rate))
+        let out = padSamples > 0 ? [Float](repeating: 0, count: padSamples) + audio : audio
+        return Result(samples: out, sampleRate: rate)
+    }
+
+    // MARK: - Private helpers
+
+    /// Returns the decoded reference audio for `referencePath`, or `nil` when
+    /// no path was given. Reuses the cached samples only when both the path and
+    /// the file's modification date match the cached entry.
+    private func loadReference(_ referencePath: String?) throws -> [Float]? {
+        guard let path = referencePath, !path.isEmpty else { return nil }
+
+        let url = URL(fileURLWithPath: (path as NSString).expandingTildeInPath)
+        guard FileManager.default.fileExists(atPath: url.path) else {
+            throw ValidationError("Reference file not found: \(path)")
+        }
+        let attributes = try? FileManager.default.attributesOfItem(atPath: url.path)
+        let modified = attributes?[.modificationDate] as? Date
+
+        if path == cachedReferencePath, modified == cachedReferenceModified,
+           let cached = cachedReference {
+            return cached
+        }
+
+        let loaded = try AudioFileLoader.load(url: url, targetSampleRate: 16000)
+        cachedReferencePath = path
+        cachedReferenceModified = modified
+        cachedReference = loaded
+        return loaded
+    }
+
+    /// Waits until the model is free, then marks it busy for the caller.
+    private func acquireGPU() async {
+        guard gpuBusy else {
+            gpuBusy = true
+            return
+        }
+        await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
+            gpuWaiters.append(continuation)
+        }
+        // Ownership was handed over directly by `releaseGPU`; `gpuBusy` is
+        // still true at this point.
+    }
+
+    /// Hands the model to the next waiter, or marks it free when none queue.
+    private func releaseGPU() {
+        if gpuWaiters.isEmpty {
+            gpuBusy = false
+        } else {
+            gpuWaiters.removeFirst().resume()
+        }
+    }
+}