feat: gpu-check probe + resident TTS daemon (serve)

Add two subcommands and the dependencies they need, shipped as madcat-say 0.1.0.

gpu-check (Sources/GpuCheck.swift)
  Queries the Metal device and runs a small MLX compute probe to verify that the GPU pipeline and the bundled mlx.metallib resolve before synthesis.

serve (Sources/Serve.swift)
  Loopback HTTP daemon (default 127.0.0.1:8765, Hummingbird). Loads VoxCPM2 once behind an actor (serializes the single GPU), warms the pipeline at boot, and caches the last reference voice.
  Routes:
    GET  /health           -> {status,model,ready,uptime_s}
    POST /v1/audio/speech  {input|text, voice?, language?, timesteps?, cfg?, prepad?} -> audio/wav
  Cuts warm synthesis to ~3.2s, versus ~6.3s cold (in-process model load).

speak (Sources/MadcatSay.swift)
  Probes the daemon's /health (0.6s) and forwards over HTTP when it is up; falls back to an in-process load otherwise. New flags: --daemon-port, --no-daemon.

Package.swift: add mlx-swift (GPU probe) and hummingbird 2.5..<2.17 (HTTP only, no WebSocket — avoids the swift-websocket pin).

Makefile: resolve the metallib via `swift build --show-bin-path` on both packages instead of the triple-prefixed/symlink path, which is not always present in speech-swift.
2026-06-10 21:04:19 +02:00
parent 67be877f7f
commit da04416ea4
5 changed files with 520 additions and 11 deletions
@@ -0,0 +1,231 @@
+import ArgumentParser
+import AudioCommon
+import Foundation
+import Hummingbird
+import NIOCore
+@preconcurrency import VoxCPM2TTS
+
+/// `madcat-say serve` — resident VoxCPM2 TTS daemon.
+///
+/// Loads the model once, optionally runs a warmup synthesis, and keeps the
+/// model resident while serving synthesis over an HTTP API bound to
+/// `host:port` (loopback by default). This pays the cold model load once at
+/// startup instead of on every `madcat-say` invocation. The `speak`
+/// subcommand probes this daemon's `/health` and forwards to it when up,
+/// falling back to an in-process load when it is not.
+///
+/// Routes registered by `run()`:
+/// - `GET /health` — JSON `{status, model, ready, uptime_s}`; HTTP 200 when
+///   the engine reports ready, 503 otherwise.
+/// - `POST /v1/audio/speech` — JSON body `{input|text, voice?, language?,
+///   timesteps?, cfg?, prepad?}`; answers `audio/wav` on success, or a JSON
+///   `{"error": ...}` body with status 400 (no text) or 500 (synthesis error).
+struct Serve: AsyncParsableCommand {
+    static let configuration = CommandConfiguration(
+        commandName: "serve",
+        abstract: "Run the resident VoxCPM2 TTS daemon (load once, stay warm)."
+    )
+
+    /// Largest request body, in bytes, collected for `POST /v1/audio/speech` (1 MiB).
+    static let maxRequestBytes = 1 << 20
+
+    /// Leading silence, in seconds, requested when the body omits `prepad`.
+    static let defaultPrepadSeconds = 0.1
+
+    @Option(name: .long, help: "Host to bind (default: 127.0.0.1).")
+    var host: String = "127.0.0.1"
+
+    @Option(name: .long, help: "Port to bind (default: 8765).")
+    var port: Int = 8765
+
+    @Option(name: .long, help: "Model variant: int4 (default), int8, bf16.")
+    var variant: String = "int4"
+
+    @Option(name: .long, help: "Default diffusion timesteps per patch (default 10).")
+    var timesteps: Int = 10
+
+    @Option(name: .long, help: "Default classifier-free guidance scale (default 2.0).")
+    var cfg: Float = 2.0
+
+    @Flag(name: .long, help: "Skip the warmup synthesis at startup.")
+    var noWarmup: Bool = false
+
+    /// Rejects option values that could only fail later, so a bad invocation
+    /// is reported before the model load starts rather than at bind time.
+    ///
+    /// - Throws: `ValidationError` when `port` is outside `0...65535` or
+    ///   `timesteps` is less than 1.
+    func validate() throws {
+        guard (0...65535).contains(port) else {
+            throw ValidationError("--port must be in 0...65535 (got \(port)).")
+        }
+        guard timesteps >= 1 else {
+            throw ValidationError("--timesteps must be at least 1 (got \(timesteps)).")
+        }
+    }
+
+    /// Loads the model, optionally warms it up, registers the HTTP routes and
+    /// runs the server until `Application.run()` returns.
+    ///
+    /// - Throws: Errors from the model load, the warmup synthesis, or the
+    ///   HTTP application.
+    func run() async throws {
+        let engine = SynthEngine(variant: variant, defaultTimesteps: timesteps, defaultCfg: cfg)
+        // Same identifier format that `SynthEngine.init` builds from `variant`;
+        // only used here for logging and the `/health` payload.
+        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
+
+        log("Loading \(modelId) ...")
+        try await engine.load()
+        if !noWarmup {
+            log("Warming up GPU pipeline ...")
+            try await engine.warmup()
+        }
+        log("Model resident. Serving on http://\(host):\(port)")
+        log("  GET  /health")
+        log("  POST /v1/audio/speech   {input|text, voice?, language?, timesteps?, cfg?, prepad?}")
+
+        // Uptime is measured from the end of load/warmup, not from process start.
+        let started = Date()
+        let router = Router()
+
+        router.get("/health") { _, _ -> Response in
+            let ready = await engine.isReady
+            let uptime = Int(Date().timeIntervalSince(started))
+            let json = "{\"status\":\"\(ready ? "ok" : "loading")\",\"model\":\"\(modelId)\",\"ready\":\(ready),\"uptime_s\":\(uptime)}"
+            return Response(
+                status: ready ? .ok : .serviceUnavailable,
+                headers: [.contentType: "application/json"],
+                body: .init(byteBuffer: .init(string: json)))
+        }
+
+        router.post("/v1/audio/speech") { request, _ -> Response in
+            let body = try await request.body.collect(upTo: Self.maxRequestBytes)
+            // A body that is not a JSON object parses to an empty request and
+            // is reported below as a missing text field.
+            let req = SpeechRequest.parse(body)
+            guard let text = req.text, !text.isEmpty else {
+                return Self.errorResponse("Missing 'input' (or 'text') field.", status: .badRequest)
+            }
+            do {
+                let result = try await engine.synthesize(
+                    text: text,
+                    language: req.language,
+                    referencePath: req.voice,
+                    timesteps: req.timesteps,
+                    cfg: req.cfg,
+                    prepad: req.prepad ?? Self.defaultPrepadSeconds)
+                let wav = try Self.encodeWAV(samples: result.samples, sampleRate: result.sampleRate)
+                return Response(
+                    status: .ok,
+                    headers: [.contentType: "audio/wav"],
+                    body: .init(byteBuffer: .init(data: wav)))
+            } catch {
+                // Synthesis and WAV-encoding failures are both reported as 500.
+                return Self.errorResponse("Synthesis failed: \(error)", status: .internalServerError)
+            }
+        }
+
+        let app = Application(
+            router: router,
+            configuration: .init(address: .hostname(host, port: port)))
+        try await app.run()
+    }
+
+    // MARK: - Helpers
+
+    /// Encodes `samples` as WAV bytes by writing a uniquely named temporary
+    /// file with `WAVWriter` and reading it back.
+    ///
+    /// - Parameters:
+    ///   - samples: Audio samples to encode.
+    ///   - sampleRate: Sample rate passed through to `WAVWriter`.
+    /// - Returns: The contents of the written WAV file.
+    /// - Throws: Errors from `WAVWriter.write` or from reading the file back.
+    static func encodeWAV(samples: [Float], sampleRate: Int) throws -> Data {
+        let tmp = FileManager.default.temporaryDirectory
+            .appendingPathComponent("madcat-say-serve-\(UUID().uuidString).wav")
+        // Registered before the write so a partially written file is removed
+        // when the write itself throws; removing a missing file is ignored.
+        defer { try? FileManager.default.removeItem(at: tmp) }
+        try WAVWriter.write(samples: samples, sampleRate: sampleRate, to: tmp)
+        return try Data(contentsOf: tmp)
+    }
+
+    /// Builds a JSON `{"error": message}` response with the given status.
+    ///
+    /// The body is empty if the message cannot be serialized.
+    static func errorResponse(_ message: String, status: HTTPResponse.Status) -> Response {
+        let data = (try? JSONSerialization.data(withJSONObject: ["error": message])) ?? Data()
+        return Response(
+            status: status,
+            headers: [.contentType: "application/json"],
+            body: .init(byteBuffer: .init(data: data)))
+    }
+
+    /// Writes `message` plus a newline to standard error.
+    func log(_ message: String) {
+        FileHandle.standardError.write(Data((message + "\n").utf8))
+    }
+}
+
+// MARK: - Request model
+
+/// Body of a `/v1/audio/speech` request after lenient JSON decoding.
+///
+/// The shape is OpenAI-like: `input` carries the text to speak (with `text`
+/// accepted as a fallback key) and `voice` is a path to a reference WAV on the
+/// daemon's own host, used for zero-shot cloning. Every field is optional;
+/// a key that is absent or has an unexpected JSON type is left `nil`.
+struct SpeechRequest {
+    var text: String?
+    var voice: String?
+    var language: String?
+    var timesteps: Int?
+    var cfg: Float?
+    var prepad: Double?
+
+    /// Decodes a request from the raw body bytes.
+    ///
+    /// Never fails: when the body is not a JSON object, the returned request
+    /// has every field `nil` and the caller decides how to report that.
+    ///
+    /// - Parameter body: The collected HTTP request body.
+    /// - Returns: The request with whichever fields could be read.
+    static func parse(_ body: ByteBuffer) -> SpeechRequest {
+        let payload = Data(buffer: body)
+        guard
+            let decoded = try? JSONSerialization.jsonObject(with: payload),
+            let fields = decoded as? [String: Any]
+        else {
+            return SpeechRequest()
+        }
+
+        // `input` takes precedence; `text` is consulted only when `input` is
+        // missing or is not a string.
+        let spokenText = (fields["input"] as? String) ?? (fields["text"] as? String)
+        // `cfg` is read as a JSON number and narrowed to the engine's Float.
+        let guidance = (fields["cfg"] as? Double).map { Float($0) }
+
+        return SpeechRequest(
+            text: spokenText,
+            voice: fields["voice"] as? String,
+            language: fields["language"] as? String,
+            timesteps: fields["timesteps"] as? Int,
+            cfg: guidance,
+            prepad: fields["prepad"] as? Double)
+    }
+}
+
+// MARK: - Resident synthesis engine
+
+/// Holds the loaded model and funnels synthesis through a single actor.
+/// Caches the most recently used reference voice so repeated clone calls with
+/// the same path skip the re-decode.
+///
+/// NOTE(review): actor isolation only serializes the non-suspending stretches
+/// of these methods. If `generateVoxCPM2` genuinely suspends, a second
+/// `synthesize` call can start before the first one finishes — confirm
+/// against the VoxCPM2TTS API before relying on strict GPU serialization.
+actor SynthEngine {
+    /// Upper bound, in seconds, on the leading silence a caller may request.
+    /// Keeps a bogus value from overflowing the sample-count conversion or
+    /// allocating an unbounded buffer.
+    private static let maxPrepadSeconds: Double = 30
+
+    private var model: VoxCPM2TTSModel?
+    private let modelId: String
+    private let defaultTimesteps: Int
+    private let defaultCfg: Float
+    // Single-entry cache, keyed by the path string exactly as the caller sent
+    // it; a file rewritten in place under the same path is not reloaded.
+    private var cachedReferencePath: String?
+    private var cachedReference: [Float]?
+
+    /// - Parameters:
+    ///   - variant: Suffix of the model identifier `aufklarer/VoxCPM2-MLX-<variant>`.
+    ///   - defaultTimesteps: Timesteps used for warmup and when a request gives none.
+    ///   - defaultCfg: Guidance value used for warmup and when a request gives none.
+    init(variant: String, defaultTimesteps: Int, defaultCfg: Float) {
+        self.modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
+        self.defaultTimesteps = defaultTimesteps
+        self.defaultCfg = defaultCfg
+    }
+
+    /// `true` once a model has been stored by `load()`.
+    var isReady: Bool { model != nil }
+
+    /// Loads the model for `modelId` unless one is already stored, writing
+    /// progress lines to standard error.
+    ///
+    /// - Throws: Whatever `VoxCPM2TTSModel.fromPretrained` throws.
+    func load() async throws {
+        if model != nil { return }
+        let m = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
+            FileHandle.standardError.write(Data("  [\(Int(progress * 100))%] \(status)\n".utf8))
+        }
+        model = m
+    }
+
+    /// Runs one short synthesis with the default settings and discards the
+    /// audio. Does nothing when no model is loaded.
+    func warmup() async throws {
+        guard let m = model else { return }
+        _ = try await m.generateVoxCPM2(
+            text: "Warming up.",
+            language: nil,
+            refAudio: nil,
+            inferenceTimesteps: defaultTimesteps,
+            cfgValue: defaultCfg)
+    }
+
+    /// Synthesized audio plus the sample rate reported by the model.
+    struct Result { let samples: [Float]; let sampleRate: Int }
+
+    /// Synthesizes `text`, optionally cloning the voice in `referencePath`,
+    /// and prepends `prepad` seconds of silence.
+    ///
+    /// - Parameters:
+    ///   - text: Text to speak.
+    ///   - language: Passed through to the model unchanged.
+    ///   - referencePath: Path to a reference audio file (`~` is expanded);
+    ///     `nil` or empty means no reference audio.
+    ///   - timesteps: Per-call override; `nil` uses the engine default.
+    ///   - cfg: Per-call override; `nil` uses the engine default.
+    ///   - prepad: Leading silence in seconds. Negative and non-finite values
+    ///     count as 0; values above `maxPrepadSeconds` are capped.
+    /// - Returns: The padded samples and their sample rate.
+    /// - Throws: `ValidationError` for an invalid `timesteps`, a missing
+    ///   reference file, a model that is still absent after loading, or empty
+    ///   output; also rethrows loader and model errors.
+    func synthesize(
+        text: String,
+        language: String?,
+        referencePath: String?,
+        timesteps: Int?,
+        cfg: Float?,
+        prepad: Double
+    ) async throws -> Result {
+        // Request-supplied values are untrusted: refuse a non-positive step
+        // count here instead of handing it to the model.
+        if let steps = timesteps, steps < 1 {
+            throw ValidationError("'timesteps' must be at least 1 (got \(steps)).")
+        }
+
+        if model == nil { try await load() }
+        guard let m = model else {
+            throw ValidationError("Model failed to load.")
+        }
+
+        var refAudio: [Float]?
+        if let path = referencePath, !path.isEmpty {
+            if path == cachedReferencePath, let cached = cachedReference {
+                refAudio = cached
+            } else {
+                let url = URL(fileURLWithPath: (path as NSString).expandingTildeInPath)
+                guard FileManager.default.fileExists(atPath: url.path) else {
+                    throw ValidationError("Reference file not found: \(path)")
+                }
+                let loaded = try AudioFileLoader.load(url: url, targetSampleRate: 16000)
+                cachedReferencePath = path
+                cachedReference = loaded
+                refAudio = loaded
+            }
+        }
+
+        let audio = try await m.generateVoxCPM2(
+            text: text,
+            language: language,
+            refAudio: refAudio,
+            inferenceTimesteps: timesteps ?? defaultTimesteps,
+            cfgValue: cfg ?? defaultCfg)
+        guard !audio.isEmpty else {
+            throw ValidationError("No audio was generated.")
+        }
+
+        let rate = m.sampleRate
+        // Converting a non-finite or out-of-range Double to Int traps, so the
+        // duration is sanitized and bounded before the conversion.
+        let padSeconds = prepad.isFinite ? min(max(prepad, 0), Self.maxPrepadSeconds) : 0
+        let padSamples = Int(padSeconds * Double(rate))
+        let out = padSamples > 0 ? [Float](repeating: 0, count: padSamples) + audio : audio
+        return Result(samples: out, sampleRate: rate)
+    }
+}