feat: gpu-check probe + resident TTS daemon (serve)

Add two subcommands and the deps they need, shipped as madcat-say 0.1.0.

gpu-check (Sources/GpuCheck.swift): queries the Metal device and runs a small MLX compute probe to verify that the GPU pipeline and the bundled mlx.metallib resolve before synth.

serve (Sources/Serve.swift): loopback HTTP daemon (default 127.0.0.1:8765, Hummingbird). Loads VoxCPM2 once behind an actor (serializes the single GPU), warms the pipeline at boot, and caches the last reference voice. Routes:
  GET /health -> {status,model,ready,uptime_s}
  POST /v1/audio/speech {input|text, voice?, language?, timesteps?, cfg?, prepad?} -> audio/wav
Cuts warm synth to ~3.2s vs ~6.3s cold (in-process model load).

speak (Sources/MadcatSay.swift): probes the daemon /health (0.6s) and forwards over HTTP when up; falls back to in-process load otherwise. New flags: --daemon-port, --no-daemon.

Package.swift: add mlx-swift (GPU probe) and hummingbird 2.5..<2.17 (HTTP only, no WebSocket — avoids the swift-websocket pin).

Makefile: resolve the metallib via `swift build --show-bin-path` on both packages instead of the triple-prefixed/symlink path, which is not always present in speech-swift.
2026-06-10 21:04:19 +02:00
parent 67be877f7f
commit da04416ea4
5 changed files with 520 additions and 11 deletions
@@ -0,0 +1,177 @@
+import ArgumentParser
+import Dispatch
+import Foundation
+import MLX
+
+/// `madcat-say gpu-check` — confirm that MLX inference will run on the Apple
+/// Silicon Metal GPU rather than silently falling back to CPU, and that the
+/// compiled Metal shader library (`mlx.metallib`) is in place next to the
+/// binary.
+///
+/// What it checks:
+///   1. `GPU.deviceInfo()` reports an Apple GPU architecture (Metal present).
+///      This is the only item that decides the verdict / exit code.
+///   2. `mlx.metallib` exists next to the executable. Missing => shaders are
+///      JIT-compiled (~5x slower), or inference aborts with
+///      "Failed to load the default metallib". Reported, not enforced.
+///   3. A real matmul is forced to completion with `eval()`; cold/warm
+///      timings, the MLX active-memory delta and a checksum are reported.
+///
+/// Exit code: 0 = PASS (Apple Metal GPU is the compute device), 2 = FAIL.
+struct GpuCheck: AsyncParsableCommand {
+    static let configuration = CommandConfiguration(
+        commandName: "gpu-check",
+        abstract: "Confirm MLX is using the Apple Silicon Metal GPU (not CPU)."
+    )
+
+    /// Accepted range for the probe edge length N.
+    ///
+    /// The lower bound matches the historical `max(8, probe)` clamp. The upper
+    /// bound keeps the probe's working set bounded: at N = 8192 an NxN matrix
+    /// of 4-byte floats is 256 MiB (assuming MLX's default Float32 dtype), so
+    /// the handful of matrices the probe keeps alive stay around 1 GiB and a
+    /// mistyped `--probe` cannot exhaust memory before the report is printed.
+    private static let probeBounds = 8...8192
+
+    @Flag(name: .long, help: "Emit machine-readable JSON instead of text.")
+    var json = false
+
+    @Option(name: .long, help: "Edge length N of the NxN matmul probe (default 1024, clamped to 8...8192).")
+    var probe: Int = 1024
+
+    /// Gathers device info, runs the matmul probe, prints the report and
+    /// exits with code 2 when no Apple GPU architecture was reported.
+    ///
+    /// - Throws: `ExitCode(2)` when the verdict is FAIL.
+    func run() async throws {
+        // 1. Metal device identity. Only a Metal device answers deviceInfo();
+        //    on Apple Silicon the architecture string contains "apple".
+        let info = GPU.deviceInfo()
+        let architecture = info.architecture
+        let isAppleGPU = architecture.lowercased().contains("apple")
+        let totalMemory = info.memorySize
+        let maxWorkingSet = Int(info.maxRecommendedWorkingSetSize)
+
+        // 2. metallib presence (project convention: copied next to the binary).
+        let (metallibPath, metallibFound) = Self.metallibStatus()
+
+        // 3. Live GPU compute probe: matmul -> eval(). The cold eval pays the
+        //    Metal pipeline / metallib load; the warm eval is steady state.
+        //    N is clamped on both sides; the effective value is what gets
+        //    reported, so the user can see when their input was adjusted.
+        let n = min(max(Self.probeBounds.lowerBound, probe), Self.probeBounds.upperBound)
+        let activeBefore = Memory.activeMemory
+
+        let a = ones([n, n])
+        let b = ones([n, n])
+
+        let coldStart = DispatchTime.now()
+        var c = matmul(a, b)
+        eval(c)
+        let coldMs = Self.elapsedMs(since: coldStart)
+
+        // Same shapes as the cold run, so this measures the already-warm path.
+        let warmStart = DispatchTime.now()
+        c = matmul(c, b)
+        eval(c)
+        let warmMs = Self.elapsedMs(since: warmStart)
+
+        // Reading the scalar back forces the reduction to finish as well.
+        let checksum = c.sum().item(Float.self)
+        let memoryDelta = Memory.activeMemory - activeBefore
+        // The verdict depends on the device identity only; metallib presence
+        // and probe numbers are informational.
+        let pass = isAppleGPU
+
+        let report = Report(
+            appleGPU: isAppleGPU,
+            architecture: architecture,
+            totalMemory: totalMemory,
+            maxWorkingSet: maxWorkingSet,
+            metallibPath: metallibPath,
+            metallibFound: metallibFound,
+            probe: n,
+            coldMs: coldMs,
+            warmMs: warmMs,
+            checksum: checksum,
+            memoryDelta: memoryDelta,
+            pass: pass
+        )
+
+        print(json ? report.json : report.text)
+
+        if !pass {
+            throw ExitCode(2)
+        }
+    }
+
+    // MARK: - Helpers
+
+    /// MLX expects `mlx.metallib` next to the executable (see the Makefile,
+    /// which copies it out of speech-swift's build dir).
+    ///
+    /// - Returns: The candidate path (directory of the symlink-resolved
+    ///   executable + `mlx.metallib`) and whether a file exists there.
+    private static func metallibStatus() -> (path: String, found: Bool) {
+        // Fall back to argv[0] when the bundle cannot name the executable.
+        let exe = Bundle.main.executableURL
+            ?? URL(fileURLWithPath: CommandLine.arguments.first ?? "madcat-say")
+        let dir = exe.resolvingSymlinksInPath().deletingLastPathComponent()
+        let candidate = dir.appendingPathComponent("mlx.metallib")
+        return (candidate.path, FileManager.default.fileExists(atPath: candidate.path))
+    }
+
+    /// Milliseconds elapsed between `start` and now.
+    private static func elapsedMs(since start: DispatchTime) -> Double {
+        // Wrapping subtraction: never traps, even if the clock readings were
+        // somehow out of order.
+        Double(DispatchTime.now().uptimeNanoseconds &- start.uptimeNanoseconds) / 1_000_000
+    }
+
+    /// Formats a byte count with binary (1024-based) units and one decimal,
+    /// e.g. `1536` -> `"1.5 KB"`. Zero and negative inputs render as `"0 B"`.
+    static func human(_ bytes: Int) -> String {
+        guard bytes > 0 else { return "0 B" }
+        let units = ["B", "KB", "MB", "GB", "TB"]
+        var value = Double(bytes)
+        var i = 0
+        // Stop at the largest unit so the index never runs past the table.
+        while value >= 1024 && i < units.count - 1 {
+            value /= 1024
+            i += 1
+        }
+        return String(format: "%.1f %@", value, units[i])
+    }
+}
+
+/// Result of a `gpu-check` run, with text + JSON renderings.
+private struct Report {
+    /// Whether the reported architecture was recognised as an Apple GPU.
+    let appleGPU: Bool
+    /// Architecture string as reported by the device.
+    let architecture: String
+    /// Total memory, in bytes.
+    let totalMemory: Int
+    /// Maximum recommended working set, in bytes.
+    let maxWorkingSet: Int
+    /// Path where `mlx.metallib` was looked for.
+    let metallibPath: String
+    /// Whether a file exists at `metallibPath`.
+    let metallibFound: Bool
+    /// Edge length of the square matmul probe.
+    let probe: Int
+    /// Duration of the first (cold) eval, in milliseconds.
+    let coldMs: Double
+    /// Duration of the second (warm) eval, in milliseconds.
+    let warmMs: Double
+    /// Sum of the probe result, used as a sanity value.
+    let checksum: Float
+    /// Change in MLX active memory across the probe, in bytes (may be negative).
+    let memoryDelta: Int
+    /// Overall verdict.
+    let pass: Bool
+
+    /// Human-readable, multi-line rendering of the report.
+    var text: String {
+        var lines: [String] = []
+        lines.append("madcat-say gpu-check")
+        lines.append("--------------------")
+        lines.append("Metal GPU       : \(architecture)  (Apple Silicon: \(appleGPU ? "YES" : "NO"))")
+        lines.append("Total memory    : \(GpuCheck.human(totalMemory))")
+        lines.append("Max working set : \(GpuCheck.human(maxWorkingSet))")
+        lines.append("MLX metallib    : \(metallibFound ? "found" : "MISSING")  (\(metallibPath))")
+        if !metallibFound {
+            lines.append("                  WARNING: shaders will JIT-compile (~5x slower) or fail to load.")
+            lines.append("                  Fix: run `make` (copies speech-swift's mlx.metallib next to the binary).")
+        }
+        lines.append("Compute probe   : \(probe)x\(probe) matmul on GPU")
+        lines.append(String(format: "  cold eval     : %.2f ms   (Metal pipeline / metallib load)", coldMs))
+        lines.append(String(format: "  warm eval     : %.2f ms", warmMs))
+        lines.append("  GPU mem delta : \(GpuCheck.human(max(0, memoryDelta))) active")
+        lines.append(String(format: "  checksum      : %.0f", checksum))
+        lines.append("VERDICT: " + (pass
+            ? "PASS - MLX inference will run on the Metal GPU."
+            : "FAIL - no Apple Metal GPU detected; inference would fall back to CPU."))
+        return lines.joined(separator: "\n")
+    }
+
+    /// Pretty-printed JSON rendering with sorted keys.
+    ///
+    /// A non-finite checksum (NaN / infinity) is emitted as `null`, because
+    /// JSON cannot represent it. If the payload still cannot be serialized,
+    /// a minimal `{"verdict": ...}` object is returned instead.
+    var json: String {
+        let verdict = pass ? "PASS" : "FAIL"
+        let payload: [String: Any] = [
+            "apple_gpu": appleGPU,
+            "architecture": architecture,
+            "total_memory_bytes": totalMemory,
+            "max_recommended_working_set_bytes": maxWorkingSet,
+            "metallib_path": metallibPath,
+            "metallib_found": metallibFound,
+            "probe_dim": probe,
+            "cold_eval_ms": coldMs,
+            "warm_eval_ms": warmMs,
+            "active_memory_delta_bytes": memoryDelta,
+            "checksum": Self.jsonNumber(Double(checksum)),
+            "verdict": verdict
+        ]
+        // JSONSerialization raises an Objective-C exception (which `try?`
+        // cannot catch) for objects that are not valid JSON, e.g. NaN or
+        // infinite numbers. Validate first so the fallback is reachable.
+        guard JSONSerialization.isValidJSONObject(payload),
+              let data = try? JSONSerialization.data(
+                  withJSONObject: payload, options: [.prettyPrinted, .sortedKeys]
+              )
+        else {
+            return "{\"verdict\":\"\(verdict)\"}"
+        }
+        return String(decoding: data, as: UTF8.self)
+    }
+
+    /// Returns `value` when it is finite, otherwise `NSNull()` so the number
+    /// serializes as JSON `null` rather than invalidating the whole payload.
+    private static func jsonNumber(_ value: Double) -> Any {
+        value.isFinite ? value : NSNull()
+    }
+}