feat: gpu-check probe + resident TTS daemon (serve)
Add two subcommands and the deps they need, shipped as madcat-say 0.1.0.
gpu-check (Sources/GpuCheck.swift)
Queries the Metal device and runs a small MLX compute probe to verify
the GPU pipeline + bundled mlx.metallib resolve before synth.
serve (Sources/Serve.swift)
Loopback HTTP daemon (default 127.0.0.1:8765, Hummingbird). Loads
VoxCPM2 once behind an actor (serializes the single GPU), warms the
pipeline at boot, caches the last reference voice. Routes:
GET /health -> {status,model,ready,uptime_s}
POST /v1/audio/speech {input|text, voice?, language?, timesteps?,
cfg?, prepad?} -> audio/wav
Cuts synthesis time to ~3.2s warm vs ~6.3s cold (in-process model load).
speak (Sources/MadcatSay.swift)
Probes the daemon's /health endpoint (0.6s timeout) and forwards over HTTP when up;
falls back to in-process load otherwise. New flags --daemon-port,
--no-daemon.
Package.swift: add mlx-swift (GPU probe) and hummingbird 2.5..<2.17
(HTTP only, no WebSocket — avoids the swift-websocket pin).
Makefile: resolve the metallib via `swift build --show-bin-path` on
both packages instead of the triple-prefixed/symlink path, which is
not always present in speech-swift.
This commit is contained in:
+92
-4
@@ -3,16 +3,30 @@ import AudioCommon
|
||||
import Foundation
|
||||
@preconcurrency import VoxCPM2TTS
|
||||
|
||||
/// madcat-say — speak text in a (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
|
||||
/// madcat-say — on-device voice-cloning TTS using VoxCPM2 (MLX/Metal).
|
||||
///
|
||||
/// Root command. `speak` is the default subcommand, so the historical
|
||||
/// `madcat-say "text"` form keeps working unchanged; `madcat-say gpu-check`
|
||||
/// runs the Metal/GPU diagnostic.
|
||||
@main
struct MadcatSay: AsyncParsableCommand {
    /// Command metadata for the root `madcat-say` entry point.
    ///
    /// Registers the `Speak`, `GpuCheck` and `Serve` subcommands and makes
    /// `Speak` the default, so invoking the tool without naming a subcommand
    /// dispatches to `Speak`.
    static let configuration: CommandConfiguration = .init(
        commandName: "madcat-say",
        abstract: "On-device voice cloning with VoxCPM2 (MLX/Metal).",
        subcommands: [Speak.self, GpuCheck.self, Serve.self],
        defaultSubcommand: Speak.self
    )
}
|
||||
|
||||
/// Speak text in a (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
|
||||
///
|
||||
/// madcat-say "Hello there"
|
||||
/// madcat-say -r samantha.wav "I was calibrated just for you."
|
||||
/// madcat-say -r samantha.wav -l polish "Cześć, jestem Samantha."
|
||||
/// madcat-say -r samantha.wav -o out.wav "Saved instead of played."
|
||||
@main
|
||||
struct MadcatSay: AsyncParsableCommand {
|
||||
struct Speak: AsyncParsableCommand {
|
||||
static let configuration = CommandConfiguration(
|
||||
commandName: "madcat-say",
|
||||
commandName: "speak",
|
||||
abstract: "Speak text in a cloned voice using VoxCPM2 (on-device, MLX/Metal).",
|
||||
discussion: """
|
||||
With no -r, uses the model's default voice. With -r it zero-shot clones
|
||||
@@ -46,7 +60,19 @@ struct MadcatSay: AsyncParsableCommand {
|
||||
@Option(name: .long, help: "Seconds of silence prepended to avoid a clipped start (default 0.1).")
|
||||
var prepad: Double = 0.1
|
||||
|
||||
@Option(name: .long, help: "Resident daemon port to try first (default 8765).")
|
||||
var daemonPort: Int = 8765
|
||||
|
||||
@Flag(name: .long, help: "Bypass the resident daemon; always load the model in-process.")
|
||||
var noDaemon: Bool = false
|
||||
|
||||
func run() async throws {
|
||||
// Fast path: forward to the resident `serve` daemon if it is up, so the
|
||||
// model stays warm in GPU memory instead of being cold-loaded per call.
|
||||
if !noDaemon, await trySpeakViaDaemon() {
|
||||
return
|
||||
}
|
||||
|
||||
let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
|
||||
log("Loading \(modelId) ...")
|
||||
let model = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
|
||||
@@ -93,6 +119,68 @@ struct MadcatSay: AsyncParsableCommand {
|
||||
|
||||
// MARK: - Helpers
|
||||
|
||||
/// Try to satisfy this request via the resident `serve` daemon. Returns true
|
||||
/// if the daemon handled it (audio played or saved); false on any failure so
|
||||
/// the caller falls back to an in-process model load. Never throws — a down
|
||||
/// or broken daemon must degrade gracefully, not abort the command.
|
||||
private func trySpeakViaDaemon() async -> Bool {
    let base = "http://127.0.0.1:\(daemonPort)"
    let session = URLSession(configuration: .ephemeral)
    // Release the session's resources on every exit path; without this the
    // session object is never invalidated.
    defer { session.finishTasksAndInvalidate() }

    // Health probe with a short timeout so a missing daemon costs ~nothing.
    guard let healthURL = URL(string: base + "/health") else { return false }
    var healthReq = URLRequest(url: healthURL)
    healthReq.timeoutInterval = 0.6
    do {
        let (_, resp) = try await session.data(for: healthReq)
        guard let http = resp as? HTTPURLResponse, http.statusCode == 200 else { return false }
    } catch {
        // Daemon not running / not reachable: silently fall back.
        return false
    }

    // Build the synthesis request.
    var payload: [String: Any] = ["input": text, "timesteps": timesteps, "cfg": Double(cfg), "prepad": prepad]
    if let language { payload["language"] = language }
    if let reference {
        // The daemon is a separate process, so a relative path such as
        // `-r samantha.wav` would presumably be resolved against the daemon's
        // working directory rather than the caller's. Resolve it here:
        // expand `~`, then anchor it to this process's current directory
        // and normalise `.`/`..` components.
        let expanded = (reference as NSString).expandingTildeInPath
        payload["voice"] = URL(fileURLWithPath: expanded).standardizedFileURL.path
    }

    guard let speakURL = URL(string: base + "/v1/audio/speech"),
          let bodyData = try? JSONSerialization.data(withJSONObject: payload) else {
        return false
    }
    var req = URLRequest(url: speakURL)
    req.httpMethod = "POST"
    req.setValue("application/json", forHTTPHeaderField: "Content-Type")
    req.httpBody = bodyData
    // Synthesis can take a while; allow far longer than the health probe.
    req.timeoutInterval = 300

    log("Using resident daemon on port \(daemonPort) ...")
    do {
        let (wav, resp) = try await session.data(for: req)
        guard let http = resp as? HTTPURLResponse, http.statusCode == 200, !wav.isEmpty else {
            log("Daemon returned no audio; falling back to in-process load.")
            return false
        }
        if let output {
            // Save mode: write the returned WAV bytes to the requested path.
            try wav.write(to: URL(fileURLWithPath: output))
            log("Saved \(wav.count) bytes to \(output)")
        } else {
            // Play mode: spool to a uniquely named temp file, play it with
            // afplay, and remove the file once playback has finished.
            let tmp = FileManager.default.temporaryDirectory
                .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
            try wav.write(to: tmp)
            defer { try? FileManager.default.removeItem(at: tmp) }
            let proc = Process()
            proc.executableURL = URL(fileURLWithPath: "/usr/bin/afplay")
            proc.arguments = [tmp.path]
            try proc.run()
            proc.waitUntilExit()
        }
        return true
    } catch {
        // Any transport, file or process-launch error degrades to the
        // in-process path rather than aborting the command.
        log("Daemon request failed (\(error)); falling back to in-process load.")
        return false
    }
}
|
||||
|
||||
private func playThroughSpeakers(samples: [Float], sampleRate: Int) throws {
|
||||
let tmp = FileManager.default.temporaryDirectory
|
||||
.appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
|
||||
|
||||
Reference in New Issue
Block a user