da04416ea4
Add two subcommands and the deps they need, shipped as madcat-say 0.1.0.
gpu-check (Sources/GpuCheck.swift)
Queries the Metal device and runs a small MLX compute probe to verify that
the GPU pipeline and the bundled mlx.metallib both resolve before synthesis.
serve (Sources/Serve.swift)
Loopback HTTP daemon (default 127.0.0.1:8765, Hummingbird). Loads
VoxCPM2 once behind an actor (serializes the single GPU), warms the
pipeline at boot, caches the last reference voice. Routes:
GET /health -> {status,model,ready,uptime_s}
POST /v1/audio/speech {input|text, voice?, language?, timesteps?,
cfg?, prepad?} -> audio/wav
Cuts warm synth to ~3.2s vs ~6.3s cold (in-process model load).
speak (Sources/MadcatSay.swift)
Probes the daemon /health (0.6s) and forwards over HTTP when up;
falls back to in-process load otherwise. New flags --daemon-port,
--no-daemon.
Package.swift: add mlx-swift (GPU probe) and hummingbird 2.5..<2.17
(HTTP only, no WebSocket — avoids the swift-websocket pin).
Makefile: resolve the metallib via `swift build --show-bin-path` on
both packages instead of the triple-prefixed/symlink path, which is
not always present in speech-swift.
232 lines
8.5 KiB
Swift
232 lines
8.5 KiB
Swift
import ArgumentParser
|
|
import AudioCommon
|
|
import Foundation
|
|
import Hummingbird
|
|
import NIOCore
|
|
@preconcurrency import VoxCPM2TTS
|
|
|
|
/// `madcat-say serve` — resident VoxCPM2 TTS daemon.
///
/// Loads the model once, warms the Metal pipeline, and keeps it resident in GPU
/// memory, serving synthesis over a loopback HTTP API. This pays the multi-GB
/// cold model load exactly once (at startup) instead of on every `madcat-say`
/// invocation. The `speak` subcommand probes this daemon's `/health` and
/// forwards to it when up, falling back to an in-process load when it is not.
///
/// Routes registered by `run()`:
/// - `GET /health` — JSON `{status, model, ready, uptime_s}`; 200 when the
///   engine reports ready, 503 otherwise.
/// - `POST /v1/audio/speech` — JSON request (see `SpeechRequest`), responds
///   with `audio/wav` on success or a JSON `{"error": ...}` body on failure.
struct Serve: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        commandName: "serve",
        abstract: "Run the resident VoxCPM2 TTS daemon (load once, stay warm)."
    )

    @Option(name: .long, help: "Host to bind (default: 127.0.0.1).")
    var host: String = "127.0.0.1"

    @Option(name: .long, help: "Port to bind (default: 8765).")
    var port: Int = 8765

    @Option(name: .long, help: "Model variant: int4 (default), int8, bf16.")
    var variant: String = "int4"

    @Option(name: .long, help: "Default diffusion timesteps per patch (default 10).")
    var timesteps: Int = 10

    @Option(name: .long, help: "Default classifier-free guidance scale (default 2.0).")
    var cfg: Float = 2.0

    @Flag(name: .long, help: "Skip the warmup synthesis at startup.")
    var noWarmup: Bool = false

    /// Loads the model, optionally warms it up, then runs the HTTP server
    /// until the application exits.
    ///
    /// - Throws: Any error from model loading, warmup, or the HTTP application.
    func run() async throws {
        let engine = SynthEngine(variant: variant, defaultTimesteps: timesteps, defaultCfg: cfg)
        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"

        log("Loading \(modelId) ...")
        try await engine.load()
        if !noWarmup {
            log("Warming up GPU pipeline ...")
            try await engine.warmup()
        }
        log("Model resident. Serving on http://\(host):\(port)")
        log("  GET  /health")
        log("  POST /v1/audio/speech  {input|text, voice?, language?, timesteps?, cfg?, prepad?}")

        // Reference point for the `uptime_s` field reported by /health.
        let started = Date()
        let router = Router()

        router.get("/health") { _, _ -> Response in
            let ready = await engine.isReady
            let uptime = Int(Date().timeIntervalSince(started))
            let json = "{\"status\":\"\(ready ? "ok" : "loading")\",\"model\":\"\(modelId)\",\"ready\":\(ready),\"uptime_s\":\(uptime)}"
            return Response(
                status: ready ? .ok : .serviceUnavailable,
                headers: [.contentType: "application/json"],
                body: .init(byteBuffer: .init(string: json)))
        }

        router.post("/v1/audio/speech") { request, _ -> Response in
            // Request bodies are capped at 1 MiB.
            let body = try await request.body.collect(upTo: 1 << 20)
            let req = SpeechRequest.parse(body)
            guard let text = req.text, !text.isEmpty else {
                return Self.errorResponse("Missing 'input' (or 'text') field.", status: .badRequest)
            }
            do {
                let result = try await engine.synthesize(
                    text: text,
                    language: req.language,
                    referencePath: req.voice,
                    timesteps: req.timesteps,
                    cfg: req.cfg,
                    prepad: req.prepad ?? 0.1)
                let wav = try Self.encodeWAV(samples: result.samples, sampleRate: result.sampleRate)
                return Response(
                    status: .ok,
                    headers: [.contentType: "audio/wav"],
                    body: .init(byteBuffer: .init(data: wav)))
            } catch {
                // Any synthesis or encoding failure is reported as a JSON 500.
                return Self.errorResponse("Synthesis failed: \(error)", status: .internalServerError)
            }
        }

        let app = Application(
            router: router,
            configuration: .init(address: .hostname(host, port: port)))
        try await app.run()
    }

    // MARK: - Helpers

    /// Encodes `samples` as WAV by round-tripping through a uniquely named
    /// temporary file written by `WAVWriter`.
    ///
    /// - Parameters:
    ///   - samples: Audio samples to encode.
    ///   - sampleRate: Sample rate passed through to `WAVWriter`.
    /// - Returns: The bytes of the written WAV file.
    /// - Throws: Any error from `WAVWriter.write` or from reading the file back.
    static func encodeWAV(samples: [Float], sampleRate: Int) throws -> Data {
        let tmp = FileManager.default.temporaryDirectory
            .appendingPathComponent("madcat-say-serve-\(UUID().uuidString).wav")
        // Register cleanup before writing so a partially written file is also
        // removed when `WAVWriter.write` throws. Removing a file that was never
        // created fails harmlessly because the error is discarded.
        defer { try? FileManager.default.removeItem(at: tmp) }
        try WAVWriter.write(samples: samples, sampleRate: sampleRate, to: tmp)
        return try Data(contentsOf: tmp)
    }

    /// Builds a JSON error response of the form `{"error": message}`.
    ///
    /// - Parameters:
    ///   - message: Human-readable error description.
    ///   - status: HTTP status to return.
    /// - Returns: A response with a JSON content type; the body is empty if
    ///   serialization fails.
    static func errorResponse(_ message: String, status: HTTPResponse.Status) -> Response {
        let data = (try? JSONSerialization.data(withJSONObject: ["error": message])) ?? Data()
        return Response(
            status: status,
            headers: [.contentType: "application/json"],
            body: .init(byteBuffer: .init(data: data)))
    }

    /// Writes `message` plus a trailing newline to standard error.
    func log(_ message: String) {
        FileHandle.standardError.write(Data((message + "\n").utf8))
    }
}
|
|
|
|
// MARK: - Request model
|
|
|
|
/// Decoded body of a `/v1/audio/speech` call.
///
/// Loosely follows the OpenAI shape: the text comes from `input` (with `text`
/// accepted as a fallback key) and `voice` carries the path of a reference
/// audio file on the daemon's host, used for zero-shot cloning. Every field is
/// optional; a key that is absent or has an unexpected JSON type stays `nil`.
struct SpeechRequest {
    var text: String?
    var voice: String?
    var language: String?
    var timesteps: Int?
    var cfg: Float?
    var prepad: Double?

    /// Decodes a request from a raw JSON body.
    ///
    /// - Parameter body: The collected HTTP request body.
    /// - Returns: The decoded request. When the body is not valid JSON or its
    ///   top level is not an object, a request with every field `nil`.
    static func parse(_ body: ByteBuffer) -> SpeechRequest {
        let payload = Data(buffer: body)
        guard
            let decoded = try? JSONSerialization.jsonObject(with: payload),
            let fields = decoded as? [String: Any]
        else {
            return SpeechRequest()
        }

        // `cfg` is read as a Double and narrowed to Float.
        let guidance = (fields["cfg"] as? Double).map { Float($0) }

        return SpeechRequest(
            text: (fields["input"] as? String) ?? (fields["text"] as? String),
            voice: fields["voice"] as? String,
            language: fields["language"] as? String,
            timesteps: fields["timesteps"] as? Int,
            cfg: guidance,
            prepad: fields["prepad"] as? Double)
    }
}
|
|
|
|
// MARK: - Resident synthesis engine
|
|
|
|
/// Holds the loaded model and funnels synthesis through a single actor. Caches
/// the most recently used reference voice so repeated clone calls skip the
/// re-decode.
///
/// NOTE(review): actor isolation is released at every `await`. If
/// `generateVoxCPM2` genuinely suspends, overlapping `synthesize` calls may
/// interleave rather than run strictly one after another — confirm against
/// the model implementation if strict GPU serialization is required.
actor SynthEngine {
    /// Upper bound on the leading silence a caller may request, in seconds.
    /// Keeps a client-supplied `prepad` from forcing an enormous allocation.
    private static let maxPrepadSeconds: Double = 30

    private var model: VoxCPM2TTSModel?
    private let modelId: String
    private let defaultTimesteps: Int
    private let defaultCfg: Float
    // Single-entry cache: path and decoded samples of the last reference voice.
    private var cachedReferencePath: String?
    private var cachedReference: [Float]?

    /// - Parameters:
    ///   - variant: Suffix appended to `aufklarer/VoxCPM2-MLX-` to form the model id.
    ///   - defaultTimesteps: Timesteps used when a request does not specify any.
    ///   - defaultCfg: Guidance value used when a request does not specify one.
    init(variant: String, defaultTimesteps: Int, defaultCfg: Float) {
        self.modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
        self.defaultTimesteps = defaultTimesteps
        self.defaultCfg = defaultCfg
    }

    /// `true` once a model has been loaded.
    var isReady: Bool { model != nil }

    /// Loads the model if it is not already loaded, writing progress lines to
    /// standard error.
    ///
    /// - Throws: Any error from `VoxCPM2TTSModel.fromPretrained`.
    func load() async throws {
        if model != nil { return }
        let m = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
            FileHandle.standardError.write(Data("  [\(Int(progress * 100))%] \(status)\n".utf8))
        }
        model = m
    }

    /// Runs one throwaway synthesis with the default settings and discards the
    /// output. Does nothing when no model is loaded.
    ///
    /// - Throws: Any error from the model's generation call.
    func warmup() async throws {
        guard let m = model else { return }
        _ = try await m.generateVoxCPM2(
            text: "Warming up.",
            language: nil,
            refAudio: nil,
            inferenceTimesteps: defaultTimesteps,
            cfgValue: defaultCfg)
    }

    /// Output of one synthesis: the samples and the sample rate they are at.
    struct Result { let samples: [Float]; let sampleRate: Int }

    /// Synthesizes `text`, optionally conditioned on a reference audio file.
    ///
    /// - Parameters:
    ///   - text: Text to synthesize.
    ///   - language: Language hint forwarded to the model, or `nil`.
    ///   - referencePath: Path to a reference audio file (`~` is expanded);
    ///     `nil` or empty means no reference.
    ///   - timesteps: Per-request timesteps; falls back to the engine default.
    ///   - cfg: Per-request guidance value; falls back to the engine default.
    ///   - prepad: Seconds of leading silence to prepend. Non-finite or
    ///     non-positive values add none; larger values are capped at 30 seconds.
    /// - Returns: The generated samples (with any leading silence) and the
    ///   model's sample rate.
    /// - Throws: `ValidationError` when the model is unavailable, the reference
    ///   file does not exist, or no audio is produced; otherwise any error from
    ///   loading the reference audio or from the model.
    func synthesize(
        text: String,
        language: String?,
        referencePath: String?,
        timesteps: Int?,
        cfg: Float?,
        prepad: Double
    ) async throws -> Result {
        if model == nil { try await load() }
        guard let m = model else {
            throw ValidationError("Model failed to load.")
        }

        var refAudio: [Float]?
        if let path = referencePath, !path.isEmpty {
            if path == cachedReferencePath, let cached = cachedReference {
                // Same path as the previous request: reuse the decoded samples.
                refAudio = cached
            } else {
                let url = URL(fileURLWithPath: (path as NSString).expandingTildeInPath)
                guard FileManager.default.fileExists(atPath: url.path) else {
                    throw ValidationError("Reference file not found: \(path)")
                }
                let loaded = try AudioFileLoader.load(url: url, targetSampleRate: 16000)
                cachedReferencePath = path
                cachedReference = loaded
                refAudio = loaded
            }
        }

        let audio = try await m.generateVoxCPM2(
            text: text,
            language: language,
            refAudio: refAudio,
            inferenceTimesteps: timesteps ?? defaultTimesteps,
            cfgValue: cfg ?? defaultCfg)
        guard !audio.isEmpty else {
            throw ValidationError("No audio was generated.")
        }

        let rate = m.sampleRate
        let padSamples = Self.leadingSilenceSampleCount(seconds: prepad, sampleRate: rate)
        let out = padSamples > 0 ? [Float](repeating: 0, count: padSamples) + audio : audio
        return Result(samples: out, sampleRate: rate)
    }

    /// Converts a requested silence duration into a sample count without
    /// trapping on hostile input.
    ///
    /// `Int(_: Double)` crashes the process when the value is NaN, infinite, or
    /// outside `Int`'s range, and `prepad` comes straight from the request
    /// body, so the duration is validated and capped before the conversion.
    private static func leadingSilenceSampleCount(seconds: Double, sampleRate: Int) -> Int {
        guard seconds.isFinite, seconds > 0, sampleRate > 0 else { return 0 }
        let bounded = min(seconds, maxPrepadSeconds)
        return Int(bounded * Double(sampleRate))
    }
}
|