import ArgumentParser
import AudioCommon
import Foundation
@preconcurrency import VoxCPM2TTS

/// madcat-say — on-device voice-cloning TTS using VoxCPM2 (MLX/Metal).
///
/// Root command. `speak` is the default subcommand, so the historical
/// `madcat-say "text"` form keeps working unchanged; `madcat-say gpu-check`
/// runs the Metal/GPU diagnostic. `serve` is registered as a third subcommand
/// (the resident daemon that `speak` tries before loading the model itself).
@main
struct MadcatSay: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        commandName: "madcat-say",
        abstract: "On-device voice cloning with VoxCPM2 (MLX/Metal).",
        subcommands: [Speak.self, GpuCheck.self, Serve.self],
        defaultSubcommand: Speak.self
    )
}

/// Speak text in an (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
///
///     madcat-say "Hello there"
///     madcat-say -r samantha.wav "I was calibrated just for you."
///     madcat-say -r samantha.wav -l polish "Cześć, jestem Samantha."
///     madcat-say -r samantha.wav -o out.wav "Saved instead of played."
struct Speak: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        commandName: "speak",
        abstract: "Speak text in a cloned voice using VoxCPM2 (on-device, MLX/Metal).",
        discussion: """
            With no -r, uses the model's default voice. With -r it zero-shot clones
            the reference voice. Language is auto-detected from the text; -l is an
            optional hint. By default the audio is played through the speakers; pass
            -o to write a 48 kHz WAV instead.
            """
    )

    @Argument(help: "The text to speak.")
    var text: String

    @Option(name: [.customShort("r"), .long], help: "Reference voice WAV to clone (any sample rate).")
    var reference: String?

    @Option(name: [.customShort("l"), .long], help: "Language hint (auto-detected if omitted).")
    var language: String?

    @Option(name: [.customShort("o"), .long], help: "Write WAV here instead of playing it.")
    var output: String?

    @Option(name: .long, help: "Model variant: int4 (default), int8, bf16.")
    var variant: String = "int4"

    @Option(name: .long, help: "Diffusion timesteps per patch (default 10; lower = faster, rougher).")
    var timesteps: Int = 10

    @Option(name: .long, help: "Classifier-free guidance scale (default 2.0).")
    var cfg: Float = 2.0

    @Option(name: .long, help: "Seconds of silence prepended to avoid a clipped start (default 0.1).")
    var prepad: Double = 0.1

    @Option(name: .long, help: "Resident daemon port to try first (default 8765).")
    var daemonPort: Int = 8765

    @Flag(name: .long, help: "Bypass the resident daemon; always load the model in-process.")
    var noDaemon: Bool = false

    /// Rejects numeric options that would crash later instead of failing cleanly.
    ///
    /// `Double`/`Float` parsing accepts `nan` and `inf`. Converting such a value
    /// to `Int` traps, and `JSONSerialization` raises an Objective-C exception
    /// for non-finite numbers that `try?` cannot intercept.
    ///
    /// - Throws: `ValidationError` when `--prepad` or `--cfg` is not finite.
    func validate() throws {
        guard prepad.isFinite else {
            throw ValidationError("--prepad must be a finite number of seconds.")
        }
        guard cfg.isFinite else {
            throw ValidationError("--cfg must be a finite number.")
        }
    }

    /// Synthesizes `text` and either plays it or writes it to `output`.
    ///
    /// Tries the resident daemon first (unless `--no-daemon`), then falls back
    /// to loading the model in-process.
    ///
    /// - Throws: `ValidationError` when the reference file is missing, no audio
    ///   is produced, or the requested pre-pad is too large; rethrows errors
    ///   from model loading, synthesis, WAV writing and launching the player.
    func run() async throws {
        // Fast path: forward to the resident `serve` daemon if it is up, so the
        // model stays warm in GPU memory instead of being cold-loaded per call.
        if !noDaemon, await trySpeakViaDaemon() { return }

        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
        log("Loading \(modelId) ...")
        let model = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
            log(" [\(Int(progress * 100))%] \(status)")
        }
        defer { model.unload() }

        var refAudio: [Float]? = nil
        if let reference {
            // Same tilde/relative-path resolution as the daemon path, so both
            // routes accept exactly the same `-r` values.
            let url = referenceFileURL(reference)
            guard FileManager.default.fileExists(atPath: url.path) else {
                throw ValidationError("Reference file not found: \(reference)")
            }
            // VoxCPM2 ingests reference audio at 16 kHz.
            refAudio = try AudioFileLoader.load(url: url, targetSampleRate: 16000)
            log(" Reference: \(refAudio?.count ?? 0) samples")
        }

        log("Synthesizing ...")
        let audio = try await model.generateVoxCPM2(
            text: text,
            language: language,
            refAudio: refAudio,
            inferenceTimesteps: timesteps,
            cfgValue: cfg
        )
        guard !audio.isEmpty else {
            throw ValidationError("No audio was generated.")
        }

        let rate = model.sampleRate
        // Prepend leading silence so the first phoneme isn't clipped on playback.
        guard let padSamples = Self.leadingSilenceSampleCount(seconds: prepad, sampleRate: rate) else {
            throw ValidationError("--prepad is too large.")
        }
        let out = padSamples > 0 ? [Float](repeating: 0, count: padSamples) + audio : audio

        if let output {
            let url = URL(fileURLWithPath: output)
            try WAVWriter.write(samples: out, sampleRate: rate, to: url)
            log("Saved \(out.count) samples (\(seconds(out.count, rate))s) to \(output)")
        } else {
            try playThroughSpeakers(samples: out, sampleRate: rate)
        }
    }

    // MARK: - Helpers

    /// Number of zero samples that represent `seconds` of silence at `sampleRate`.
    ///
    /// - Returns: `0` for zero or negative durations (no padding), the truncated
    ///   sample count otherwise, or `nil` when the product is not finite or does
    ///   not fit in `Int` — cases where a plain `Int(_:)` conversion would trap.
    static func leadingSilenceSampleCount(seconds: Double, sampleRate: Int) -> Int? {
        let raw = seconds * Double(sampleRate)
        guard raw.isFinite else { return nil }
        guard raw > 0 else { return 0 }
        return Int(exactly: raw.rounded(.towardZero))
    }

    /// Expands a leading `~` and anchors a relative path to the current
    /// working directory, yielding an absolute file URL.
    private func referenceFileURL(_ path: String) -> URL {
        URL(fileURLWithPath: (path as NSString).expandingTildeInPath).absoluteURL
    }

    /// Fresh, uniquely named WAV location inside the temporary directory.
    private func temporaryWAVURL() -> URL {
        FileManager.default.temporaryDirectory
            .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
    }

    /// Plays a WAV file with `/usr/bin/afplay` and blocks until it exits.
    ///
    /// A non-zero exit status is reported on stderr but does not throw, so the
    /// command's exit code is unaffected by the player's.
    ///
    /// - Throws: Whatever `Process.run()` throws when the player cannot start.
    private func playWAVFile(at url: URL) throws {
        let proc = Process()
        proc.executableURL = URL(fileURLWithPath: "/usr/bin/afplay")
        proc.arguments = [url.path]
        try proc.run()
        proc.waitUntilExit()
        if proc.terminationStatus != 0 {
            log("Warning: afplay exited with status \(proc.terminationStatus).")
        }
    }

    /// Try to satisfy this request via the resident `serve` daemon. Returns true
    /// if the daemon handled it (audio played or saved); false on any failure so
    /// the caller falls back to an in-process model load. Never throws — a down
    /// or broken daemon must degrade gracefully, not abort the command.
    private func trySpeakViaDaemon() async -> Bool {
        let base = "http://127.0.0.1:\(daemonPort)"
        let session = URLSession(configuration: .ephemeral)
        // Release the session once both requests below have finished.
        defer { session.finishTasksAndInvalidate() }

        // Health probe with a short timeout so a missing daemon costs ~nothing.
        guard let healthURL = URL(string: base + "/health") else { return false }
        var healthReq = URLRequest(url: healthURL)
        healthReq.timeoutInterval = 0.6
        do {
            let (_, resp) = try await session.data(for: healthReq)
            guard let http = resp as? HTTPURLResponse, http.statusCode == 200 else { return false }
        } catch {
            return false
        }

        // Build the synthesis request.
        var payload: [String: Any] = [
            "input": text,
            "timesteps": timesteps,
            "cfg": Double(cfg),
            "prepad": prepad,
        ]
        if let language { payload["language"] = language }
        if let reference {
            // The daemon is a separate process whose working directory may
            // differ from ours, so send an absolute path when the file exists
            // here. Otherwise pass the tilde-expanded string through unchanged
            // and let the daemon decide what to make of it.
            let expanded = (reference as NSString).expandingTildeInPath
            let absolute = referenceFileURL(reference).path
            payload["voice"] = FileManager.default.fileExists(atPath: absolute) ? absolute : expanded
        }
        // `data(withJSONObject:)` raises an Objective-C exception (not a Swift
        // error) on invalid objects such as non-finite numbers; check first.
        guard JSONSerialization.isValidJSONObject(payload),
              let speakURL = URL(string: base + "/v1/audio/speech"),
              let bodyData = try? JSONSerialization.data(withJSONObject: payload)
        else { return false }

        var req = URLRequest(url: speakURL)
        req.httpMethod = "POST"
        req.setValue("application/json", forHTTPHeaderField: "Content-Type")
        req.httpBody = bodyData
        req.timeoutInterval = 300

        log("Using resident daemon on port \(daemonPort) ...")
        do {
            let (wav, resp) = try await session.data(for: req)
            guard let http = resp as? HTTPURLResponse, http.statusCode == 200, !wav.isEmpty else {
                log("Daemon returned no audio; falling back to in-process load.")
                return false
            }
            if let output {
                try wav.write(to: URL(fileURLWithPath: output))
                log("Saved \(wav.count) bytes to \(output)")
            } else {
                let tmp = temporaryWAVURL()
                // Registered before the write so a partially written file is
                // removed as well.
                defer { try? FileManager.default.removeItem(at: tmp) }
                try wav.write(to: tmp)
                try playWAVFile(at: tmp)
            }
            return true
        } catch {
            log("Daemon request failed (\(error)); falling back to in-process load.")
            return false
        }
    }

    /// Writes `samples` to a temporary WAV file, plays it, and removes the file.
    ///
    /// - Throws: Errors from `WAVWriter.write` or from launching the player.
    private func playThroughSpeakers(samples: [Float], sampleRate: Int) throws {
        let tmp = temporaryWAVURL()
        // Registered before the write so a partially written file is removed too.
        defer { try? FileManager.default.removeItem(at: tmp) }
        try WAVWriter.write(samples: samples, sampleRate: sampleRate, to: tmp)
        try playWAVFile(at: tmp)
    }

    /// Duration of `count` samples at `rate`, formatted with two decimals.
    private func seconds(_ count: Int, _ rate: Int) -> String {
        String(format: "%.2f", Double(count) / Double(rate))
    }

    /// Progress/status goes to stderr so stdout stays clean for piping.
    private func log(_ message: String) {
        FileHandle.standardError.write(Data((message + "\n").utf8))
    }
}