Files
madcat-say/Sources/GpuCheck.swift
T
madcat-core da04416ea4 feat: gpu-check probe + resident TTS daemon (serve)
Add two subcommands and the deps they need, shipped as madcat-say 0.1.0.

gpu-check (Sources/GpuCheck.swift)
  Queries the Metal device and runs a small MLX compute probe to verify
  the GPU pipeline + bundled mlx.metallib resolve before synth.

serve (Sources/Serve.swift)
  Loopback HTTP daemon (default 127.0.0.1:8765, Hummingbird). Loads
  VoxCPM2 once behind an actor (serializes the single GPU), warms the
  pipeline at boot, caches the last reference voice. Routes:
    GET  /health            -> {status,model,ready,uptime_s}
    POST /v1/audio/speech   {input|text, voice?, language?, timesteps?,
                             cfg?, prepad?} -> audio/wav
  Cuts warm synth to ~3.2s vs ~6.3s cold (in-process model load).

speak (Sources/MadcatSay.swift)
  Probes the daemon /health (0.6s) and forwards over HTTP when up;
  falls back to in-process load otherwise. New flags --daemon-port,
  --no-daemon.

Package.swift: add mlx-swift (GPU probe) and hummingbird 2.5..<2.17
  (HTTP only, no WebSocket — avoids the swift-websocket pin).

Makefile: resolve the metallib via `swift build --show-bin-path` on
  both packages instead of the triple-prefixed/symlink path, which is
  not always present in speech-swift.
2026-06-10 21:04:19 +02:00

178 lines
6.8 KiB
Swift

import ArgumentParser
import Dispatch
import Foundation
import MLX
/// `madcat-say gpu-check`: confirms that MLX inference will run on the Apple
/// Silicon Metal GPU instead of silently falling back to the CPU, and reports
/// whether the compiled Metal shader library (`mlx.metallib`) sits next to the
/// binary.
///
/// Steps performed:
/// 1. `GPU.deviceInfo()` is queried; an architecture string containing
///    "apple" (case-insensitive) is treated as an Apple Metal GPU.
/// 2. `mlx.metallib` is looked up beside the executable. When it is missing,
///    shaders are JIT-compiled (~5x slower) or inference aborts with
///    "Failed to load the default metallib".
/// 3. Two chained matmuls are forced to completion with `eval()`; their
///    timings, a checksum and the MLX active-memory delta are reported.
///
/// The verdict depends on step 1 only; steps 2 and 3 are informational.
///
/// Exit code: 0 = PASS (Apple Metal GPU is the compute device), 2 = FAIL.
struct GpuCheck: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        commandName: "gpu-check",
        abstract: "Confirm MLX is using the Apple Silicon Metal GPU (not CPU)."
    )

    @Flag(name: .long, help: "Emit machine-readable JSON instead of text.")
    var json = false

    @Option(name: .long, help: "Edge length N of the NxN matmul probe (default 1024).")
    var probe: Int = 1024

    /// Gathers device facts, runs the compute probe, prints the report and
    /// exits with code 2 when no Apple GPU was detected.
    ///
    /// - Throws: `ExitCode(2)` when the verdict is FAIL.
    func run() async throws {
        // Step 1: device identity as reported by MLX.
        let device = GPU.deviceInfo()
        let gpuName = device.architecture
        let onAppleGPU = gpuName.lowercased().contains("apple")
        let deviceMemory = device.memorySize
        let workingSet = Int(device.maxRecommendedWorkingSetSize)

        // Step 2: shader library expected beside the executable.
        let metallib = Self.metallibStatus()

        // Step 3: live compute probe; edges below 8 are raised to 8.
        let edge = max(8, probe)
        let measured = Self.runMatmulProbe(edge: edge)

        let verdict = onAppleGPU
        let report = Report(
            appleGPU: onAppleGPU,
            architecture: gpuName,
            totalMemory: deviceMemory,
            maxWorkingSet: workingSet,
            metallibPath: metallib.path,
            metallibFound: metallib.found,
            probe: edge,
            coldMs: measured.coldMs,
            warmMs: measured.warmMs,
            checksum: measured.checksum,
            memoryDelta: measured.memoryDelta,
            pass: verdict
        )

        if json {
            print(report.json)
        } else {
            print(report.text)
        }

        guard verdict else {
            throw ExitCode(2)
        }
    }

    // MARK: - Helpers

    /// Runs `ones(edge x edge) @ ones(edge x edge)` twice, forcing each
    /// product with `eval()`.
    ///
    /// The first (cold) evaluation also pays for the Metal pipeline /
    /// metallib load; the second (warm) one is the steady-state cost. The
    /// statement order is deliberate: the timestamps bracket exactly one
    /// matmul + eval each, and the memory delta is sampled last.
    ///
    /// - Parameter edge: Edge length of the square operands.
    /// - Returns: Cold and warm wall times in milliseconds, the sum of the
    ///   final product, and the change in `Memory.activeMemory` across the
    ///   whole probe.
    private static func runMatmulProbe(
        edge: Int
    ) -> (coldMs: Double, warmMs: Double, checksum: Float, memoryDelta: Int) {
        let baseline = Memory.activeMemory
        let lhs = ones([edge, edge])
        let rhs = ones([edge, edge])

        let coldClock = DispatchTime.now()
        var product = matmul(lhs, rhs)
        eval(product)
        let cold = elapsedMs(since: coldClock)

        // Reassign the same variable so the first product is not kept alive
        // by an extra binding while the memory delta is sampled.
        let warmClock = DispatchTime.now()
        product = matmul(product, rhs)
        eval(product)
        let warm = elapsedMs(since: warmClock)

        let total = product.sum().item(Float.self)
        let growth = Memory.activeMemory - baseline
        return (cold, warm, total, growth)
    }

    /// Locates `mlx.metallib` beside the (symlink-resolved) executable, which
    /// is where the Makefile copies it from speech-swift's build directory.
    ///
    /// - Returns: The candidate path and whether something exists there.
    private static func metallibStatus() -> (path: String, found: Bool) {
        let executable: URL
        if let fromBundle = Bundle.main.executableURL {
            executable = fromBundle
        } else {
            executable = URL(fileURLWithPath: CommandLine.arguments.first ?? "madcat-say")
        }
        let location = executable
            .resolvingSymlinksInPath()
            .deletingLastPathComponent()
            .appendingPathComponent("mlx.metallib")
            .path
        let exists = FileManager.default.fileExists(atPath: location)
        return (location, exists)
    }

    /// Milliseconds elapsed between `start` and now.
    private static func elapsedMs(since start: DispatchTime) -> Double {
        let now = DispatchTime.now().uptimeNanoseconds
        let nanos = now &- start.uptimeNanoseconds
        return Double(nanos) / 1_000_000
    }

    /// Formats a byte count with one decimal and a 1024-based unit
    /// (B, KB, MB, GB, TB). Zero and negative counts render as "0 B".
    static func human(_ bytes: Int) -> String {
        guard bytes > 0 else { return "0 B" }
        var scaled = Double(bytes)
        var suffix = "B"
        // Step up one unit at a time while the value still has four or more
        // integer digits' worth of magnitude; TB is the ceiling.
        for larger in ["KB", "MB", "GB", "TB"] {
            guard scaled >= 1024 else { break }
            scaled /= 1024
            suffix = larger
        }
        return String(format: "%.1f %@", scaled, suffix)
    }
}
/// Result of a `gpu-check` run, with text + JSON renderings.
///
/// A plain value holder: every field is supplied by the caller and the two
/// computed properties only format what is stored here.
private struct Report {
    /// Whether the device architecture was recognised as an Apple GPU.
    let appleGPU: Bool
    /// Architecture string reported for the device.
    let architecture: String
    /// Device memory size in bytes.
    let totalMemory: Int
    /// Recommended maximum working-set size in bytes.
    let maxWorkingSet: Int
    /// Path where `mlx.metallib` was looked for.
    let metallibPath: String
    /// Whether something exists at `metallibPath`.
    let metallibFound: Bool
    /// Edge length of the square matmul probe.
    let probe: Int
    /// Wall time of the first (cold) evaluation, in milliseconds.
    let coldMs: Double
    /// Wall time of the second (warm) evaluation, in milliseconds.
    let warmMs: Double
    /// Sum of the final probe result.
    let checksum: Float
    /// Change in MLX active memory across the probe, in bytes (may be negative).
    let memoryDelta: Int
    /// Overall verdict.
    let pass: Bool

    /// Human-readable multi-line report, without a trailing newline.
    var text: String {
        var lines: [String] = []
        lines.append("madcat-say gpu-check")
        lines.append("--------------------")
        lines.append("Metal GPU : \(architecture) (Apple Silicon: \(appleGPU ? "YES" : "NO"))")
        lines.append("Total memory : \(GpuCheck.human(totalMemory))")
        lines.append("Max working set : \(GpuCheck.human(maxWorkingSet))")
        lines.append("MLX metallib : \(metallibFound ? "found" : "MISSING") (\(metallibPath))")
        if !metallibFound {
            lines.append(" WARNING: shaders will JIT-compile (~5x slower) or fail to load.")
            lines.append(" Fix: run `make` (copies speech-swift's mlx.metallib next to the binary).")
        }
        lines.append("Compute probe : \(probe)x\(probe) matmul on GPU")
        lines.append(String(format: " cold eval : %.2f ms (Metal pipeline / metallib load)", coldMs))
        lines.append(String(format: " warm eval : %.2f ms", warmMs))
        // A negative delta is shown as zero in the text rendering only.
        lines.append(" GPU mem delta : \(GpuCheck.human(max(0, memoryDelta))) active")
        lines.append(String(format: " checksum : %.0f", checksum))
        lines.append("VERDICT: " + (pass
            ? "PASS - MLX inference will run on the Metal GPU."
            : "FAIL - no Apple Metal GPU detected; inference would fall back to CPU."))
        return lines.joined(separator: "\n")
    }

    /// Pretty-printed JSON report with sorted keys.
    ///
    /// Falls back to a minimal `{"verdict":...}` object when the payload
    /// cannot be serialized, so machine consumers always receive valid JSON.
    var json: String {
        let verdict = pass ? "PASS" : "FAIL"
        let payload: [String: Any] = [
            "apple_gpu": appleGPU,
            "architecture": architecture,
            "total_memory_bytes": totalMemory,
            "max_recommended_working_set_bytes": maxWorkingSet,
            "metallib_path": metallibPath,
            "metallib_found": metallibFound,
            "probe_dim": probe,
            "cold_eval_ms": coldMs,
            "warm_eval_ms": warmMs,
            "active_memory_delta_bytes": memoryDelta,
            "checksum": Double(checksum),
            "verdict": verdict
        ]
        // JSONSerialization signals an unserializable payload (e.g. a NaN or
        // infinite number) with an Objective-C exception, which `try?` cannot
        // intercept. Validate first so the fallback is actually reachable.
        guard JSONSerialization.isValidJSONObject(payload),
              let data = try? JSONSerialization.data(
                  withJSONObject: payload, options: [.prettyPrinted, .sortedKeys]
              )
        else {
            return "{\"verdict\":\"\(verdict)\"}"
        }
        return String(decoding: data, as: UTF8.self)
    }
}