initial commit

On-device voice cloning CLI using VoxCPM2 via MLX on macOS. Swift 6, macOS 15+, depends on speech-swift (local path dep).
2026-05-31 14:34:59 +02:00
commit 67be877f7f
5 changed files with 497 additions and 0 deletions
@@ -0,0 +1,117 @@
+import ArgumentParser
+import AudioCommon
+import Foundation
+@preconcurrency import VoxCPM2TTS
+
+/// madcat-say — speak text in an (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
+///
+///   madcat-say "Hello there"
+///   madcat-say -r samantha.wav "I was calibrated just for you."
+///   madcat-say -r samantha.wav -l polish "Cześć, jestem Samantha."
+///   madcat-say -r samantha.wav -o out.wav "Saved instead of played."
+///
+/// Arguments are checked in `validate()` and the reference file is decoded
+/// before the model is loaded, so bad input fails fast.
+@main
+struct MadcatSay: AsyncParsableCommand {
+    static let configuration = CommandConfiguration(
+        commandName: "madcat-say",
+        abstract: "Speak text in a cloned voice using VoxCPM2 (on-device, MLX/Metal).",
+        discussion: """
+        With no -r, uses the model's default voice. With -r it zero-shot clones
+        the reference voice. Language is auto-detected from the text; -l is an
+        optional hint. By default the audio is played through the speakers; pass
+        -o to write a 48 kHz WAV instead.
+        """
+    )
+
+    /// Variant suffixes accepted by `--variant`; the chosen one is appended to the model id.
+    private static let supportedVariants = ["int4", "int8", "bf16"]
+
+    /// Upper bound for `--prepad`. Keeps the silence buffer small and the
+    /// seconds-to-samples conversion well inside `Int` range.
+    private static let maxPrepadSeconds = 60.0
+
+    @Argument(help: "The text to speak.")
+    var text: String
+
+    @Option(name: [.customShort("r"), .long], help: "Reference voice WAV to clone (any sample rate).")
+    var reference: String?
+
+    @Option(name: [.customShort("l"), .long], help: "Language hint (auto-detected if omitted).")
+    var language: String?
+
+    @Option(name: [.customShort("o"), .long], help: "Write WAV here instead of playing it.")
+    var output: String?
+
+    @Option(name: .long, help: "Model variant: int4 (default), int8, bf16.")
+    var variant: String = "int4"
+
+    @Option(name: .long, help: "Diffusion timesteps per patch (default 10; lower = faster, rougher).")
+    var timesteps: Int = 10
+
+    @Option(name: .long, help: "Classifier-free guidance scale (default 2.0).")
+    var cfg: Float = 2.0
+
+    @Option(name: .long, help: "Seconds of silence prepended to avoid a clipped start (default 0.1).")
+    var prepad: Double = 0.1
+
+    /// Rejects arguments that would otherwise fail late or trap at runtime.
+    ///
+    /// - Throws: `ValidationError` describing the first offending argument.
+    func validate() throws {
+        guard !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
+            throw ValidationError("The text to speak must not be empty.")
+        }
+        guard Self.supportedVariants.contains(variant) else {
+            let expected = Self.supportedVariants.joined(separator: ", ")
+            throw ValidationError("Unknown variant '\(variant)'. Expected one of: \(expected).")
+        }
+        guard timesteps >= 1 else {
+            throw ValidationError("--timesteps must be at least 1.")
+        }
+        guard cfg.isFinite else {
+            throw ValidationError("--cfg must be a finite number.")
+        }
+        // Floating-point parsing accepts "nan" and "inf"; converting either to
+        // `Int` traps. Negative values stay allowed and are clamped to zero later.
+        guard prepad.isFinite, prepad <= Self.maxPrepadSeconds else {
+            throw ValidationError("--prepad must be a finite number of at most \(Int(Self.maxPrepadSeconds)) seconds.")
+        }
+    }
+
+    /// Loads the model, synthesizes `text`, then either writes a WAV (`-o`) or plays it.
+    ///
+    /// - Throws: `ValidationError` when the reference file is missing or no audio
+    ///   was produced, `ExitCode.failure` when playback fails, and any error
+    ///   raised by model loading, synthesis, or file I/O.
+    func run() async throws {
+        // Decode the reference before touching the model: it is cheap, and a
+        // missing or unreadable file should not cost a full model load first.
+        let refAudio = try loadReferenceAudio()
+
+        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
+        log("Loading \(modelId) ...")
+        let model = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
+            log("  [\(Int(progress * 100))%] \(status)")
+        }
+        defer { model.unload() }
+
+        if let refAudio {
+            log("  Reference: \(refAudio.count) samples")
+        }
+
+        log("Synthesizing ...")
+        let audio = try await model.generateVoxCPM2(
+            text: text,
+            language: language,
+            refAudio: refAudio,
+            inferenceTimesteps: timesteps,
+            cfgValue: cfg
+        )
+        guard !audio.isEmpty else {
+            throw ValidationError("No audio was generated.")
+        }
+        let rate = model.sampleRate
+
+        // Prepend leading silence so the first phoneme isn't clipped on playback.
+        // Clamp in floating point *before* converting so a negative value can
+        // never reach the trapping `Int` initializer.
+        let padSamples = Int(max(0, prepad) * Double(rate))
+        let out = padSamples > 0 ? [Float](repeating: 0, count: padSamples) + audio : audio
+
+        if let output {
+            let url = URL(fileURLWithPath: output)
+            try WAVWriter.write(samples: out, sampleRate: rate, to: url)
+            log("Saved \(out.count) samples (\(seconds(out.count, rate))s) to \(output)")
+        } else {
+            try playThroughSpeakers(samples: out, sampleRate: rate)
+        }
+    }
+
+    // MARK: - Helpers
+
+    /// Loads the `-r` reference recording, or returns `nil` when none was given.
+    ///
+    /// - Throws: `ValidationError` if the path does not exist, or whatever
+    ///   `AudioFileLoader.load` throws for an unreadable file.
+    private func loadReferenceAudio() throws -> [Float]? {
+        guard let reference else { return nil }
+        let url = URL(fileURLWithPath: reference)
+        guard FileManager.default.fileExists(atPath: url.path) else {
+            throw ValidationError("Reference file not found: \(reference)")
+        }
+        // VoxCPM2 ingests reference audio at 16 kHz.
+        return try AudioFileLoader.load(url: url, targetSampleRate: 16000)
+    }
+
+    /// Writes `samples` to a temporary WAV and plays it with `/usr/bin/afplay`,
+    /// blocking until playback finishes.
+    ///
+    /// - Throws: errors from writing the file or launching the process, and
+    ///   `ExitCode.failure` when `afplay` exits with a non-zero status.
+    private func playThroughSpeakers(samples: [Float], sampleRate: Int) throws {
+        let tmp = FileManager.default.temporaryDirectory
+            .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
+        // Registered before the write so a partially written file is removed too.
+        defer { try? FileManager.default.removeItem(at: tmp) }
+        try WAVWriter.write(samples: samples, sampleRate: sampleRate, to: tmp)
+
+        let proc = Process()
+        proc.executableURL = URL(fileURLWithPath: "/usr/bin/afplay")
+        proc.arguments = [tmp.path]
+        try proc.run()
+        proc.waitUntilExit()
+
+        // A failed playback must not look like success to the caller or the shell.
+        guard proc.terminationStatus == 0 else {
+            log("afplay exited with status \(proc.terminationStatus)")
+            throw ExitCode.failure
+        }
+    }
+
+    /// Formats `count` samples at `rate` Hz as seconds with two decimals.
+    private func seconds(_ count: Int, _ rate: Int) -> String {
+        String(format: "%.2f", Double(count) / Double(rate))
+    }
+
+    /// Progress/status goes to stderr so stdout stays clean for piping.
+    private func log(_ message: String) {
+        FileHandle.standardError.write(Data((message + "\n").utf8))
+    }
+}