import ArgumentParser
import AudioCommon
import Foundation
import Hummingbird
import NIOCore
@preconcurrency import VoxCPM2TTS

/// `madcat-say serve` — resident VoxCPM2 TTS daemon.
///
/// Loads the model once, warms the Metal pipeline, and keeps it resident in GPU
/// memory, serving synthesis over a loopback HTTP API. This pays the multi-GB
/// cold model load exactly once (at startup) instead of on every `madcat-say`
/// invocation. The `speak` subcommand probes this daemon's `/health` and
/// forwards to it when up, falling back to an in-process load when it is not.
struct Serve: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        commandName: "serve",
        abstract: "Run the resident VoxCPM2 TTS daemon (load once, stay warm)."
    )

    @Option(name: .long, help: "Host to bind (default: 127.0.0.1).")
    var host: String = "127.0.0.1"

    @Option(name: .long, help: "Port to bind (default: 8765).")
    var port: Int = 8765

    @Option(name: .long, help: "Model variant: int4 (default), int8, bf16.")
    var variant: String = "int4"

    @Option(name: .long, help: "Default diffusion timesteps per patch (default 10).")
    var timesteps: Int = 10

    @Option(name: .long, help: "Default classifier-free guidance scale (default 2.0).")
    var cfg: Float = 2.0

    @Flag(name: .long, help: "Skip the warmup synthesis at startup.")
    var noWarmup: Bool = false

    /// Loads the model (and optionally runs one warmup synthesis), registers the
    /// HTTP routes, then runs the server until `app.run()` returns.
    ///
    /// - Throws: Any error from model load, warmup, or the HTTP application.
    func run() async throws {
        let engine = SynthEngine(variant: variant, defaultTimesteps: timesteps, defaultCfg: cfg)
        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"

        log("Loading \(modelId) ...")
        try await engine.load()

        if !noWarmup {
            log("Warming up GPU pipeline ...")
            try await engine.warmup()
        }

        log("Model resident. Serving on http://\(host):\(port)")
        log(" GET /health")
        log(" POST /v1/audio/speech {input|text, voice?, language?, timesteps?, cfg?, prepad?}")

        let started = Date()
        let router = Router()

        // Health probe: 200 once the model is loaded, 503 otherwise.
        router.get("/health") { _, _ -> Response in
            let ready = await engine.isReady
            let uptime = Int(Date().timeIntervalSince(started))
            // NOTE(review): `modelId` is interpolated without JSON escaping; this
            // assumes `--variant` never contains quotes or backslashes.
            let json = "{\"status\":\"\(ready ? "ok" : "loading")\",\"model\":\"\(modelId)\",\"ready\":\(ready),\"uptime_s\":\(uptime)}"
            return Response(
                status: ready ? .ok : .serviceUnavailable,
                headers: [.contentType: "application/json"],
                body: .init(byteBuffer: .init(string: json)))
        }

        // Synthesis endpoint: JSON in, WAV bytes out.
        router.post("/v1/audio/speech") { request, _ -> Response in
            // Request bodies are capped at 1 MiB.
            let body = try await request.body.collect(upTo: 1 << 20)
            let req = SpeechRequest.parse(body)

            guard let text = req.text, !text.isEmpty else {
                return Self.errorResponse("Missing 'input' (or 'text') field.", status: .badRequest)
            }

            do {
                let result = try await engine.synthesize(
                    text: text,
                    language: req.language,
                    referencePath: req.voice,
                    timesteps: req.timesteps,
                    cfg: req.cfg,
                    prepad: req.prepad ?? 0.1)
                let wav = try Self.encodeWAV(samples: result.samples, sampleRate: result.sampleRate)
                return Response(
                    status: .ok,
                    headers: [.contentType: "audio/wav"],
                    body: .init(byteBuffer: .init(data: wav)))
            } catch {
                // Every synthesis/encoding failure is reported as a 500 with a JSON body.
                return Self.errorResponse("Synthesis failed: \(error)", status: .internalServerError)
            }
        }

        let app = Application(
            router: router,
            configuration: .init(address: .hostname(host, port: port)))
        try await app.run()
    }

    // MARK: - Helpers

    /// Encodes `samples` as WAV bytes by writing them with `WAVWriter` to a
    /// uniquely named file in the temporary directory and reading it back.
    ///
    /// - Parameters:
    ///   - samples: Audio samples to encode.
    ///   - sampleRate: Sample rate passed through to `WAVWriter`.
    /// - Returns: The contents of the written WAV file.
    /// - Throws: Any error from `WAVWriter.write` or from reading the file back.
    static func encodeWAV(samples: [Float], sampleRate: Int) throws -> Data {
        let tmp = FileManager.default.temporaryDirectory
            .appendingPathComponent("madcat-say-serve-\(UUID().uuidString).wav")
        // Register cleanup *before* writing so a write that throws part-way
        // through does not leave a stray file behind in the temp directory.
        defer { try? FileManager.default.removeItem(at: tmp) }
        try WAVWriter.write(samples: samples, sampleRate: sampleRate, to: tmp)
        return try Data(contentsOf: tmp)
    }

    /// Builds a JSON error response of the form `{"error": message}`.
    ///
    /// If serialization fails the body is empty; the status is still returned.
    static func errorResponse(_ message: String, status: HTTPResponse.Status) -> Response {
        let data = (try? JSONSerialization.data(withJSONObject: ["error": message])) ?? Data()
        return Response(
            status: status,
            headers: [.contentType: "application/json"],
            body: .init(byteBuffer: .init(data: data)))
    }

    /// Writes `message` plus a trailing newline to standard error.
    func log(_ message: String) {
        FileHandle.standardError.write(Data((message + "\n").utf8))
    }
}

// MARK: - Request model

/// Parsed `/v1/audio/speech` request. OpenAI-ish: `input` is the text, `voice`
/// is a reference WAV path (loopback / same host) for zero-shot cloning.
struct SpeechRequest {
    var text: String?
    var voice: String?
    var language: String?
    var timesteps: Int?
    var cfg: Float?
    var prepad: Double?

    /// Parses a JSON object body into a `SpeechRequest`.
    ///
    /// Never throws: a body that is not a JSON object yields a request with all
    /// fields `nil`, and any field whose JSON value does not cast to the
    /// expected type is left `nil`. `input` takes precedence over `text`.
    static func parse(_ body: ByteBuffer) -> SpeechRequest {
        var req = SpeechRequest()
        let data = Data(buffer: body)
        guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else {
            return req
        }
        req.text = (json["input"] as? String) ?? (json["text"] as? String)
        req.voice = json["voice"] as? String
        req.language = json["language"] as? String
        req.timesteps = json["timesteps"] as? Int
        if let c = json["cfg"] as? Double { req.cfg = Float(c) }
        req.prepad = json["prepad"] as? Double
        return req
    }
}

// MARK: - Resident synthesis engine

/// Holds the loaded model and funnels synthesis requests through one actor.
/// Caches the most recently used reference voice so repeated clone calls skip
/// the re-decode.
///
/// NOTE(review): actor isolation is not held across `await`. If
/// `generateVoxCPM2` suspends, a second request can enter `synthesize` while
/// the first is still generating — confirm the model tolerates overlapping
/// calls, or add an explicit queue if strict GPU serialization is required.
actor SynthEngine {
    /// Upper bound, in seconds, on the request-controlled leading silence.
    /// Keeps an absurd `prepad` from trapping the `Double` → `Int` conversion
    /// or forcing a huge allocation.
    private static let maxPrepadSeconds: Double = 60

    private var model: VoxCPM2TTSModel?
    private let modelId: String
    private let defaultTimesteps: Int
    private let defaultCfg: Float

    // Single-entry cache of the last decoded reference audio, keyed by the
    // path string exactly as supplied by the caller.
    private var cachedReferencePath: String?
    private var cachedReference: [Float]?

    init(variant: String, defaultTimesteps: Int, defaultCfg: Float) {
        self.modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
        self.defaultTimesteps = defaultTimesteps
        self.defaultCfg = defaultCfg
    }

    /// `true` once `load()` has stored a model.
    var isReady: Bool { model != nil }

    /// Loads the model if it is not already loaded, writing progress lines to
    /// standard error.
    ///
    /// - Throws: Any error from `VoxCPM2TTSModel.fromPretrained`.
    func load() async throws {
        if model != nil { return }
        let m = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
            FileHandle.standardError.write(Data(" [\(Int(progress * 100))%] \(status)\n".utf8))
        }
        model = m
    }

    /// Runs one throwaway synthesis with the default settings and discards the
    /// audio. Does nothing when no model is loaded.
    func warmup() async throws {
        guard let m = model else { return }
        _ = try await m.generateVoxCPM2(
            text: "Warming up.",
            language: nil,
            refAudio: nil,
            inferenceTimesteps: defaultTimesteps,
            cfgValue: defaultCfg)
    }

    /// Synthesized audio together with the sample rate reported by the model.
    struct Result { let samples: [Float]; let sampleRate: Int }

    /// Synthesizes `text`, optionally conditioned on a reference recording.
    ///
    /// - Parameters:
    ///   - text: Text to speak.
    ///   - language: Optional language hint passed through to the model.
    ///   - referencePath: Optional path (tilde-expanded) to reference audio;
    ///     `nil` or empty means no reference.
    ///   - timesteps: Overrides the engine's default timesteps when non-nil.
    ///   - cfg: Overrides the engine's default guidance scale when non-nil.
    ///   - prepad: Seconds of silence to prepend. Non-finite or non-positive
    ///     values add none; values above `maxPrepadSeconds` are capped.
    /// - Returns: The (possibly padded) samples and the model's sample rate.
    /// - Throws: `ValidationError` when the model is unavailable, the reference
    ///   file is missing, or no audio is produced; otherwise any error from
    ///   audio loading or generation.
    func synthesize(
        text: String,
        language: String?,
        referencePath: String?,
        timesteps: Int?,
        cfg: Float?,
        prepad: Double
    ) async throws -> Result {
        if model == nil { try await load() }
        guard let m = model else { throw ValidationError("Model failed to load.") }

        var refAudio: [Float]?
        if let path = referencePath, !path.isEmpty {
            if path == cachedReferencePath, let cached = cachedReference {
                // NOTE(review): the cache is keyed by path only, so a file that
                // changes on disk keeps returning the previously decoded audio.
                refAudio = cached
            } else {
                let url = URL(fileURLWithPath: (path as NSString).expandingTildeInPath)
                guard FileManager.default.fileExists(atPath: url.path) else {
                    throw ValidationError("Reference file not found: \(path)")
                }
                let loaded = try AudioFileLoader.load(url: url, targetSampleRate: 16000)
                cachedReferencePath = path
                cachedReference = loaded
                refAudio = loaded
            }
        }

        let audio = try await m.generateVoxCPM2(
            text: text,
            language: language,
            refAudio: refAudio,
            inferenceTimesteps: timesteps ?? defaultTimesteps,
            cfgValue: cfg ?? defaultCfg)
        guard !audio.isEmpty else { throw ValidationError("No audio was generated.") }

        let rate = m.sampleRate
        let padSamples = Self.leadingSilenceSampleCount(seconds: prepad, sampleRate: rate)
        let out = padSamples > 0 ? [Float](repeating: 0, count: padSamples) + audio : audio
        return Result(samples: out, sampleRate: rate)
    }

    /// Converts a leading-silence duration into a sample count without risking
    /// a trap in the `Double` → `Int` conversion.
    ///
    /// Returns 0 for NaN, infinite, or non-positive `seconds` and for a
    /// non-positive `sampleRate`; durations above `maxPrepadSeconds` are capped.
    /// The result truncates toward zero.
    static func leadingSilenceSampleCount(seconds: Double, sampleRate: Int) -> Int {
        guard seconds.isFinite, seconds > 0, sampleRate > 0 else { return 0 }
        let clamped = min(seconds, maxPrepadSeconds)
        return Int(clamped * Double(sampleRate))
    }
}