feat: gpu-check probe + resident TTS daemon (serve)

Add two subcommands and the deps they need, shipped as madcat-say 0.1.0.

gpu-check (Sources/GpuCheck.swift)
  Queries the Metal device and runs a small MLX compute probe to verify
  the GPU pipeline + bundled mlx.metallib resolve before synth.

serve (Sources/Serve.swift)
  Loopback HTTP daemon (default 127.0.0.1:8765, Hummingbird). Loads
  VoxCPM2 once behind an actor (serializes the single GPU), warms the
  pipeline at boot, caches the last reference voice. Routes:
    GET  /health            -> {status,model,ready,uptime_s}
    POST /v1/audio/speech   {input|text, voice?, language?, timesteps?,
                             cfg?, prepad?} -> audio/wav
  Cuts warm synth to ~3.2s vs ~6.3s cold (in-process model load).

speak (Sources/MadcatSay.swift)
  Probes the daemon /health (0.6s) and forwards over HTTP when up;
  falls back to in-process load otherwise. New flags --daemon-port,
  --no-daemon.

Package.swift: add mlx-swift (GPU probe) and hummingbird 2.5..<2.17
  (HTTP only, no WebSocket — avoids the swift-websocket pin).

Makefile: resolve the metallib via `swift build --show-bin-path` on
  both packages instead of the triple-prefixed/symlink path, which is
  not always present in speech-swift.
This commit is contained in:
madcat-core
2026-06-10 21:04:19 +02:00
parent 67be877f7f
commit da04416ea4
5 changed files with 520 additions and 11 deletions
+177
View File
@@ -0,0 +1,177 @@
import ArgumentParser
import Dispatch
import Foundation
import MLX
/// `madcat-say gpu-check` — verify that MLX inference runs on the Apple
/// Silicon Metal GPU instead of quietly falling back to the CPU, and that the
/// compiled Metal shader library (`mlx.metallib`) sits next to the binary.
///
/// Checks performed:
///   1. `GPU.deviceInfo()` reports an architecture string containing "apple".
///   2. `mlx.metallib` exists beside the executable. When it is missing the
///      shaders are JIT-compiled (~5x slower), or inference aborts with
///      "Failed to load the default metallib".
///   3. A real matmul is forced to completion with `eval()`, and the change
///      in MLX active memory across the probe is reported.
///
/// Only check 1 decides the verdict; checks 2 and 3 are informational.
///
/// Exit code: 0 = PASS (an Apple GPU architecture was reported), 2 = FAIL.
struct GpuCheck: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        commandName: "gpu-check",
        abstract: "Confirm MLX is using the Apple Silicon Metal GPU (not CPU)."
    )

    @Flag(name: .long, help: "Emit machine-readable JSON instead of text.")
    var json = false

    @Option(name: .long, help: "Edge length N of the NxN matmul probe (default 1024).")
    var probe: Int = 1024

    /// Gathers device facts, runs the matmul probe, prints the report and
    /// throws `ExitCode(2)` when no Apple GPU architecture was reported.
    func run() async throws {
        // Step 1: device identity. The verdict hinges on the architecture
        // string containing "apple" (case-insensitive).
        let device = GPU.deviceInfo()
        let archName = device.architecture
        let looksLikeApple = archName.lowercased().contains("apple")
        let deviceMemory = device.memorySize
        let workingSetLimit = Int(device.maxRecommendedWorkingSetSize)

        // Step 2: is the shader library where the project expects it?
        let metallib = Self.metallibStatus()

        // Step 3: compute probe. The first eval is timed as "cold" and the
        // second as "warm"; the probe edge is never smaller than 8.
        let edge = max(8, probe)
        let baselineMemory = Memory.activeMemory
        let lhs = ones([edge, edge])
        let rhs = ones([edge, edge])

        let coldClock = DispatchTime.now()
        var product = matmul(lhs, rhs)
        eval(product)
        let coldElapsed = Self.elapsedMs(since: coldClock)

        let warmClock = DispatchTime.now()
        product = matmul(product, rhs)
        eval(product)
        let warmElapsed = Self.elapsedMs(since: warmClock)

        let total = product.sum().item(Float.self)
        let memoryGrowth = Memory.activeMemory - baselineMemory

        let report = Report(
            appleGPU: looksLikeApple,
            architecture: archName,
            totalMemory: deviceMemory,
            maxWorkingSet: workingSetLimit,
            metallibPath: metallib.path,
            metallibFound: metallib.found,
            probe: edge,
            coldMs: coldElapsed,
            warmMs: warmElapsed,
            checksum: total,
            memoryDelta: memoryGrowth,
            pass: looksLikeApple
        )

        if json {
            print(report.json)
        } else {
            print(report.text)
        }

        guard looksLikeApple else {
            throw ExitCode(2)
        }
    }

    // MARK: - Helpers

    /// Locates `mlx.metallib` in the directory of the (symlink-resolved)
    /// executable and reports whether a file exists there.
    ///
    /// - Returns: The candidate path and whether it exists on disk.
    private static func metallibStatus() -> (path: String, found: Bool) {
        let fallbackName = CommandLine.arguments.first ?? "madcat-say"
        let executable = Bundle.main.executableURL ?? URL(fileURLWithPath: fallbackName)
        let binaryDirectory = executable.resolvingSymlinksInPath().deletingLastPathComponent()
        let libraryPath = binaryDirectory.appendingPathComponent("mlx.metallib").path
        let exists = FileManager.default.fileExists(atPath: libraryPath)
        return (libraryPath, exists)
    }

    /// Milliseconds elapsed between `start` and now, using wrapping
    /// subtraction on the uptime nanosecond counters.
    private static func elapsedMs(since start: DispatchTime) -> Double {
        let nanos = DispatchTime.now().uptimeNanoseconds &- start.uptimeNanoseconds
        return Double(nanos) / 1_000_000
    }

    /// Formats a byte count with binary (1024-based) units, one decimal place.
    /// Zero and negative counts render as "0 B"; the largest unit is TB.
    static func human(_ bytes: Int) -> String {
        if bytes <= 0 { return "0 B" }
        let units = ["B", "KB", "MB", "GB", "TB"]
        var scaled = Double(bytes)
        // Walk every unit except the last; stop at the first one that fits.
        for unit in units.dropLast() {
            if scaled < 1024 {
                return String(format: "%.1f %@", scaled, unit)
            }
            scaled /= 1024
        }
        return String(format: "%.1f %@", scaled, units[units.count - 1])
    }
}
/// Outcome of one `gpu-check` run, renderable as human-readable text or JSON.
private struct Report {
    let appleGPU: Bool
    let architecture: String
    let totalMemory: Int
    let maxWorkingSet: Int
    let metallibPath: String
    let metallibFound: Bool
    let probe: Int
    let coldMs: Double
    let warmMs: Double
    let checksum: Float
    let memoryDelta: Int
    let pass: Bool

    /// Multi-line text report. Adds a two-line warning block when the
    /// metallib is missing and always ends with the verdict line.
    var text: String {
        let siliconFlag = appleGPU ? "YES" : "NO"
        let metallibState = metallibFound ? "found" : "MISSING"

        let deviceSection: [String] = [
            "madcat-say gpu-check",
            "--------------------",
            "Metal GPU : \(architecture) (Apple Silicon: \(siliconFlag))",
            "Total memory : \(GpuCheck.human(totalMemory))",
            "Max working set : \(GpuCheck.human(maxWorkingSet))",
            "MLX metallib : \(metallibState) (\(metallibPath))"
        ]

        let metallibWarning: [String] = metallibFound ? [] : [
            " WARNING: shaders will JIT-compile (~5x slower) or fail to load.",
            " Fix: run `make` (copies speech-swift's mlx.metallib next to the binary)."
        ]

        // A negative memory delta is displayed as zero.
        let shownDelta = GpuCheck.human(max(0, memoryDelta))
        let probeSection: [String] = [
            "Compute probe : \(probe)x\(probe) matmul on GPU",
            String(format: " cold eval : %.2f ms (Metal pipeline / metallib load)", coldMs),
            String(format: " warm eval : %.2f ms", warmMs),
            " GPU mem delta : \(shownDelta) active",
            String(format: " checksum : %.0f", checksum)
        ]

        let verdictDetail: String
        if pass {
            verdictDetail = "PASS - MLX inference will run on the Metal GPU."
        } else {
            verdictDetail = "FAIL - no Apple Metal GPU detected; inference would fall back to CPU."
        }

        let allLines = deviceSection + metallibWarning + probeSection + ["VERDICT: " + verdictDetail]
        return allLines.joined(separator: "\n")
    }

    /// Pretty-printed JSON with sorted keys. If serialization fails, falls
    /// back to a minimal object that carries only the verdict.
    var json: String {
        let verdict = pass ? "PASS" : "FAIL"
        let fields: [String: Any] = [
            "apple_gpu": appleGPU,
            "architecture": architecture,
            "total_memory_bytes": totalMemory,
            "max_recommended_working_set_bytes": maxWorkingSet,
            "metallib_path": metallibPath,
            "metallib_found": metallibFound,
            "probe_dim": probe,
            "cold_eval_ms": coldMs,
            "warm_eval_ms": warmMs,
            "active_memory_delta_bytes": memoryDelta,
            "checksum": Double(checksum),
            "verdict": verdict
        ]
        if let encoded = try? JSONSerialization.data(
            withJSONObject: fields, options: [.prettyPrinted, .sortedKeys]
        ) {
            return String(decoding: encoded, as: UTF8.self)
        }
        return "{\"verdict\":\"\(verdict)\"}"
    }
}
+92 -4
View File
@@ -3,16 +3,30 @@ import AudioCommon
import Foundation
@preconcurrency import VoxCPM2TTS
/// madcat-say speak text in a (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
/// madcat-say — on-device voice-cloning TTS using VoxCPM2 (MLX/Metal).
///
/// Root command that only dispatches to subcommands. `speak` is the default
/// subcommand, so the historical `madcat-say "text"` form keeps working
/// unchanged; `madcat-say gpu-check` runs the Metal/GPU diagnostic and
/// `madcat-say serve` is registered alongside them.
@main
struct MadcatSay: AsyncParsableCommand {
    static let configuration: CommandConfiguration = .init(
        commandName: "madcat-say",
        abstract: "On-device voice cloning with VoxCPM2 (MLX/Metal).",
        subcommands: [Speak.self, GpuCheck.self, Serve.self],
        defaultSubcommand: Speak.self
    )
}
/// Speak text in a (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
///
/// madcat-say "Hello there"
/// madcat-say -r samantha.wav "I was calibrated just for you."
/// madcat-say -r samantha.wav -l polish "Cześć, jestem Samantha."
/// madcat-say -r samantha.wav -o out.wav "Saved instead of played."
@main
struct MadcatSay: AsyncParsableCommand {
struct Speak: AsyncParsableCommand {
static let configuration = CommandConfiguration(
commandName: "madcat-say",
commandName: "speak",
abstract: "Speak text in a cloned voice using VoxCPM2 (on-device, MLX/Metal).",
discussion: """
With no -r, uses the model's default voice. With -r it zero-shot clones
@@ -46,7 +60,19 @@ struct MadcatSay: AsyncParsableCommand {
@Option(name: .long, help: "Seconds of silence prepended to avoid a clipped start (default 0.1).")
var prepad: Double = 0.1
@Option(name: .long, help: "Resident daemon port to try first (default 8765).")
var daemonPort: Int = 8765
@Flag(name: .long, help: "Bypass the resident daemon; always load the model in-process.")
var noDaemon: Bool = false
func run() async throws {
// Fast path: forward to the resident `serve` daemon if it is up, so the
// model stays warm in GPU memory instead of being cold-loaded per call.
if !noDaemon, await trySpeakViaDaemon() {
return
}
let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
log("Loading \(modelId) ...")
let model = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
@@ -93,6 +119,68 @@ struct MadcatSay: AsyncParsableCommand {
// MARK: - Helpers
/// Try to satisfy this request via the resident `serve` daemon.
///
/// Flow: probe `GET /health` with a 0.6 s timeout, POST the synthesis request
/// to `/v1/audio/speech`, then either write the returned WAV to `output` or
/// play it through `/usr/bin/afplay`.
///
/// - Returns: `true` if the daemon handled the request (audio saved or handed
///   to `afplay`); `false` on any failure so the caller falls back to an
///   in-process model load. Never throws — a down or broken daemon must
///   degrade gracefully, not abort the command.
private func trySpeakViaDaemon() async -> Bool {
    let base = "http://127.0.0.1:\(daemonPort)"
    let session = URLSession(configuration: .ephemeral)
    // Release the session on every exit path; all tasks are awaited before
    // this runs, so nothing in flight is cut short.
    defer { session.finishTasksAndInvalidate() }

    // Health probe with a short timeout so a missing daemon costs ~nothing.
    guard let healthURL = URL(string: base + "/health") else { return false }
    var healthReq = URLRequest(url: healthURL)
    healthReq.timeoutInterval = 0.6
    do {
        let (_, resp) = try await session.data(for: healthReq)
        guard let http = resp as? HTTPURLResponse, http.statusCode == 200 else { return false }
    } catch {
        return false
    }

    // Build the synthesis request.
    var payload: [String: Any] = ["input": text, "timesteps": timesteps, "cfg": Double(cfg), "prepad": prepad]
    if let language { payload["language"] = language }
    // The daemon reads the reference from disk itself, so send an expanded path.
    if let reference { payload["voice"] = (reference as NSString).expandingTildeInPath }
    guard let speakURL = URL(string: base + "/v1/audio/speech"),
          let bodyData = try? JSONSerialization.data(withJSONObject: payload) else {
        return false
    }
    var req = URLRequest(url: speakURL)
    req.httpMethod = "POST"
    req.setValue("application/json", forHTTPHeaderField: "Content-Type")
    req.httpBody = bodyData
    req.timeoutInterval = 300

    log("Using resident daemon on port \(daemonPort) ...")
    do {
        let (wav, resp) = try await session.data(for: req)
        // The port may belong to an unrelated HTTP server that also answers
        // 200; only trust the body when it is declared as audio (the daemon
        // responds with `audio/wav`).
        guard let http = resp as? HTTPURLResponse,
              http.statusCode == 200,
              (http.value(forHTTPHeaderField: "Content-Type") ?? "").lowercased().hasPrefix("audio/"),
              !wav.isEmpty else {
            log("Daemon returned no audio; falling back to in-process load.")
            return false
        }

        if let output {
            try wav.write(to: URL(fileURLWithPath: output))
            log("Saved \(wav.count) bytes to \(output)")
        } else {
            // Playback goes through a temporary file that is removed once
            // afplay has exited (or if launching it fails).
            let tmp = FileManager.default.temporaryDirectory
                .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
            try wav.write(to: tmp)
            defer { try? FileManager.default.removeItem(at: tmp) }
            let proc = Process()
            proc.executableURL = URL(fileURLWithPath: "/usr/bin/afplay")
            proc.arguments = [tmp.path]
            try proc.run()
            proc.waitUntilExit()
        }
        return true
    } catch {
        log("Daemon request failed (\(error)); falling back to in-process load.")
        return false
    }
}
private func playThroughSpeakers(samples: [Float], sampleRate: Int) throws {
let tmp = FileManager.default.temporaryDirectory
.appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
+231
View File
@@ -0,0 +1,231 @@
import ArgumentParser
import AudioCommon
import Foundation
import Hummingbird
import NIOCore
@preconcurrency import VoxCPM2TTS
/// `madcat-say serve` — resident VoxCPM2 TTS daemon.
///
/// Loads the model once, optionally runs a warmup synthesis, and then serves
/// synthesis over an HTTP API (bound to 127.0.0.1:8765 by default). This pays
/// the cold model load once at startup instead of on every `madcat-say`
/// invocation. The `speak` subcommand probes this daemon's `/health` and
/// forwards to it when up, falling back to an in-process load when it is not.
///
/// Routes:
///   - `GET  /health` -> JSON `{status, model, ready, uptime_s}`; 200 when the
///     model is loaded, 503 otherwise.
///   - `POST /v1/audio/speech` with a JSON body
///     `{input|text, voice?, language?, timesteps?, cfg?, prepad?}` ->
///     `audio/wav` on success, JSON `{error}` with 400/500 on failure.
///
/// NOTE(review): `voice` is a filesystem path read by the daemon itself, so
/// binding `--host` to a non-loopback address exposes local file reads to
/// remote callers — confirm that is intended before doing so.
struct Serve: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        commandName: "serve",
        abstract: "Run the resident VoxCPM2 TTS daemon (load once, stay warm)."
    )

    @Option(name: .long, help: "Host to bind (default: 127.0.0.1).")
    var host: String = "127.0.0.1"

    @Option(name: .long, help: "Port to bind (default: 8765).")
    var port: Int = 8765

    @Option(name: .long, help: "Model variant: int4 (default), int8, bf16.")
    var variant: String = "int4"

    @Option(name: .long, help: "Default diffusion timesteps per patch (default 10).")
    var timesteps: Int = 10

    @Option(name: .long, help: "Default classifier-free guidance scale (default 2.0).")
    var cfg: Float = 2.0

    @Flag(name: .long, help: "Skip the warmup synthesis at startup.")
    var noWarmup: Bool = false

    /// Loads (and optionally warms) the model, registers the routes and runs
    /// the HTTP application until it stops.
    ///
    /// - Throws: Any error from model load, warmup, or the HTTP server.
    func run() async throws {
        let engine = SynthEngine(variant: variant, defaultTimesteps: timesteps, defaultCfg: cfg)
        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
        log("Loading \(modelId) ...")
        try await engine.load()
        if !noWarmup {
            log("Warming up GPU pipeline ...")
            try await engine.warmup()
        }
        log("Model resident. Serving on http://\(host):\(port)")
        log(" GET /health")
        log(" POST /v1/audio/speech {input|text, voice?, language?, timesteps?, cfg?, prepad?}")

        let started = Date()
        let router = Router()

        router.get("/health") { _, _ -> Response in
            let ready = await engine.isReady
            let uptime = Int(Date().timeIntervalSince(started))
            return Self.healthResponse(ready: ready, modelId: modelId, uptime: uptime)
        }

        router.post("/v1/audio/speech") { request, _ -> Response in
            // Request bodies are capped at 1 MiB.
            let body = try await request.body.collect(upTo: 1 << 20)
            let req = SpeechRequest.parse(body)
            guard let text = req.text, !text.isEmpty else {
                return Self.errorResponse("Missing 'input' (or 'text') field.", status: .badRequest)
            }
            do {
                let result = try await engine.synthesize(
                    text: text,
                    language: req.language,
                    referencePath: req.voice,
                    timesteps: req.timesteps,
                    cfg: req.cfg,
                    prepad: req.prepad ?? 0.1)
                let wav = try Self.encodeWAV(samples: result.samples, sampleRate: result.sampleRate)
                return Response(
                    status: .ok,
                    headers: [.contentType: "audio/wav"],
                    body: .init(byteBuffer: .init(data: wav)))
            } catch {
                return Self.errorResponse("Synthesis failed: \(error)", status: .internalServerError)
            }
        }

        let app = Application(
            router: router,
            configuration: .init(address: .hostname(host, port: port)))
        try await app.run()
    }

    // MARK: - Helpers

    /// Builds the `/health` response.
    ///
    /// The body is produced by `JSONSerialization` rather than string
    /// interpolation so that a `modelId` containing quotes or backslashes
    /// (it is derived from the user-supplied `--variant`) still yields valid
    /// JSON.
    ///
    /// - Parameters:
    ///   - ready: Whether the model is loaded; selects 200 vs 503.
    ///   - modelId: Model identifier reported under `model`.
    ///   - uptime: Whole seconds since the routes were set up.
    static func healthResponse(ready: Bool, modelId: String, uptime: Int) -> Response {
        let payload: [String: Any] = [
            "status": ready ? "ok" : "loading",
            "model": modelId,
            "ready": ready,
            "uptime_s": uptime
        ]
        let data = (try? JSONSerialization.data(
            withJSONObject: payload,
            options: [.sortedKeys, .withoutEscapingSlashes])) ?? Data()
        return Response(
            status: ready ? .ok : .serviceUnavailable,
            headers: [.contentType: "application/json"],
            body: .init(byteBuffer: .init(data: data)))
    }

    /// Encodes samples as WAV bytes by writing a temporary file with
    /// `WAVWriter` and reading it back.
    ///
    /// - Throws: Errors from writing or reading the temporary file.
    static func encodeWAV(samples: [Float], sampleRate: Int) throws -> Data {
        let tmp = FileManager.default.temporaryDirectory
            .appendingPathComponent("madcat-say-serve-\(UUID().uuidString).wav")
        // Register cleanup before writing so a failed or partial write does
        // not leave the temporary file behind.
        defer { try? FileManager.default.removeItem(at: tmp) }
        try WAVWriter.write(samples: samples, sampleRate: sampleRate, to: tmp)
        return try Data(contentsOf: tmp)
    }

    /// Builds a JSON `{"error": message}` response with the given status.
    static func errorResponse(_ message: String, status: HTTPResponse.Status) -> Response {
        let data = (try? JSONSerialization.data(withJSONObject: ["error": message])) ?? Data()
        return Response(
            status: status,
            headers: [.contentType: "application/json"],
            body: .init(byteBuffer: .init(data: data)))
    }

    /// Writes a line to standard error, keeping standard output free.
    func log(_ message: String) {
        FileHandle.standardError.write(Data((message + "\n").utf8))
    }
}
// MARK: - Request model
/// Decoded body of a `/v1/audio/speech` request. OpenAI-ish: `input` carries
/// the text, and `voice` is a reference WAV path (loopback / same host) used
/// for zero-shot cloning. Every field is optional; absent or wrongly typed
/// values stay `nil`.
struct SpeechRequest {
    var text: String?
    var voice: String?
    var language: String?
    var timesteps: Int?
    var cfg: Float?
    var prepad: Double?

    /// Leniently parses a JSON object body.
    ///
    /// A body that is not a JSON object yields a request with every field
    /// `nil`. `input` takes precedence over `text` when both are strings.
    ///
    /// - Parameter body: Raw request bytes.
    /// - Returns: The fields that could be read.
    static func parse(_ body: ByteBuffer) -> SpeechRequest {
        let raw = Data(buffer: body)
        guard let fields = try? JSONSerialization.jsonObject(with: raw) as? [String: Any] else {
            return SpeechRequest()
        }
        let guidance = (fields["cfg"] as? Double).map { Float($0) }
        return SpeechRequest(
            text: (fields["input"] as? String) ?? (fields["text"] as? String),
            voice: fields["voice"] as? String,
            language: fields["language"] as? String,
            timesteps: fields["timesteps"] as? Int,
            cfg: guidance,
            prepad: fields["prepad"] as? Double)
    }
}
// MARK: - Resident synthesis engine
/// Holds the loaded model behind an actor and caches the most recently used
/// reference voice so repeated clone calls with the same path skip the
/// re-decode.
///
/// NOTE(review): the actor only serializes work between suspension points; if
/// `generateVoxCPM2` genuinely suspends, overlapping requests could interleave
/// — confirm against the model's implementation.
actor SynthEngine {
    /// Upper bound, in seconds, for the silence prepended to generated audio.
    /// Requests asking for more are clamped so that a hostile or mistaken
    /// `prepad` cannot exhaust memory.
    private static let maxPrepadSeconds: Double = 60

    private var model: VoxCPM2TTSModel?
    private let modelId: String
    private let defaultTimesteps: Int
    private let defaultCfg: Float
    private var cachedReferencePath: String?
    private var cachedReference: [Float]?

    /// - Parameters:
    ///   - variant: Suffix appended to `aufklarer/VoxCPM2-MLX-` to form the model id.
    ///   - defaultTimesteps: Timesteps used when a request does not specify any.
    ///   - defaultCfg: Guidance scale used when a request does not specify one.
    init(variant: String, defaultTimesteps: Int, defaultCfg: Float) {
        self.modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
        self.defaultTimesteps = defaultTimesteps
        self.defaultCfg = defaultCfg
    }

    /// `true` once a model has been loaded.
    var isReady: Bool { model != nil }

    /// Loads the model if it is not loaded yet, reporting progress on stderr.
    func load() async throws {
        if model != nil { return }
        let m = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
            FileHandle.standardError.write(Data(" [\(Int(progress * 100))%] \(status)\n".utf8))
        }
        model = m
    }

    /// Runs one throwaway synthesis with the default settings. Does nothing
    /// when no model is loaded.
    func warmup() async throws {
        guard let m = model else { return }
        _ = try await m.generateVoxCPM2(
            text: "Warming up.",
            language: nil,
            refAudio: nil,
            inferenceTimesteps: defaultTimesteps,
            cfgValue: defaultCfg)
    }

    /// Generated samples together with their sample rate.
    struct Result { let samples: [Float]; let sampleRate: Int }

    /// Synthesizes `text`, optionally cloning the voice in `referencePath`.
    ///
    /// - Parameters:
    ///   - text: Text to speak.
    ///   - language: Optional language hint passed through to the model.
    ///   - referencePath: Optional path to a reference audio file; `~` is
    ///     expanded. `nil` or empty means no reference audio is passed.
    ///   - timesteps: Overrides the default timesteps when non-nil.
    ///   - cfg: Overrides the default guidance scale when non-nil.
    ///   - prepad: Seconds of silence to prepend. Negative or non-finite
    ///     values are treated as 0; values above 60 s are clamped.
    /// - Returns: The (optionally padded) samples and the model sample rate.
    /// - Throws: `ValidationError` when the model is unavailable, the
    ///   reference file is missing, or no audio was produced; plus any error
    ///   from loading the reference or running the model.
    func synthesize(
        text: String,
        language: String?,
        referencePath: String?,
        timesteps: Int?,
        cfg: Float?,
        prepad: Double
    ) async throws -> Result {
        if model == nil { try await load() }
        guard let m = model else {
            throw ValidationError("Model failed to load.")
        }

        var refAudio: [Float]?
        if let path = referencePath, !path.isEmpty {
            // NOTE(review): the cache is keyed by path only, so a file that
            // changes on disk keeps serving the previously decoded audio.
            if path == cachedReferencePath, let cached = cachedReference {
                refAudio = cached
            } else {
                let url = URL(fileURLWithPath: (path as NSString).expandingTildeInPath)
                guard FileManager.default.fileExists(atPath: url.path) else {
                    throw ValidationError("Reference file not found: \(path)")
                }
                let loaded = try AudioFileLoader.load(url: url, targetSampleRate: 16000)
                cachedReferencePath = path
                cachedReference = loaded
                refAudio = loaded
            }
        }

        let audio = try await m.generateVoxCPM2(
            text: text,
            language: language,
            refAudio: refAudio,
            inferenceTimesteps: timesteps ?? defaultTimesteps,
            cfgValue: cfg ?? defaultCfg)
        guard !audio.isEmpty else {
            throw ValidationError("No audio was generated.")
        }

        let rate = m.sampleRate
        // `prepad` arrives straight from the HTTP body. Converting a huge or
        // non-finite Double to Int traps, which would take the whole daemon
        // down, so sanitize it before computing the sample count.
        let safePrepad = prepad.isFinite ? min(max(prepad, 0), Self.maxPrepadSeconds) : 0
        let padSamples = Int(safePrepad * Double(rate))
        let out = padSamples > 0 ? [Float](repeating: 0, count: padSamples) + audio : audio
        return Result(samples: out, sampleRate: rate)
    }
}