initial commit
On-device voice cloning CLI using VoxCPM2 via MLX on macOS. Swift 6, macOS 15+, depends on speech-swift (local path dep).
This commit is contained in:
@@ -0,0 +1,117 @@
|
||||
import ArgumentParser
|
||||
import AudioCommon
|
||||
import Foundation
|
||||
@preconcurrency import VoxCPM2TTS
|
||||
|
||||
/// madcat-say — speak text in an (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
///
///     madcat-say "Hello there"
///     madcat-say -r samantha.wav "I was calibrated just for you."
///     madcat-say -r samantha.wav -l polish "Cześć, jestem Samantha."
///     madcat-say -r samantha.wav -o out.wav "Saved instead of played."
///
/// Problems that can be detected from the arguments alone (blank text, a missing
/// reference file, non-positive timesteps, non-finite numbers) are rejected in
/// `validate()`, i.e. before the model is downloaded or loaded.
@main
struct MadcatSay: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        commandName: "madcat-say",
        abstract: "Speak text in a cloned voice using VoxCPM2 (on-device, MLX/Metal).",
        discussion: """
            With no -r, uses the model's default voice. With -r it zero-shot clones
            the reference voice. Language is auto-detected from the text; -l is an
            optional hint. By default the audio is played through the speakers; pass
            -o to write a 48 kHz WAV instead.
            """
    )

    @Argument(help: "The text to speak.")
    var text: String

    @Option(name: [.customShort("r"), .long], help: "Reference voice WAV to clone (any sample rate).")
    var reference: String?

    @Option(name: [.customShort("l"), .long], help: "Language hint (auto-detected if omitted).")
    var language: String?

    @Option(name: [.customShort("o"), .long], help: "Write WAV here instead of playing it.")
    var output: String?

    @Option(name: .long, help: "Model variant: int4 (default), int8, bf16.")
    var variant: String = "int4"

    @Option(name: .long, help: "Diffusion timesteps per patch (default 10; lower = faster, rougher).")
    var timesteps: Int = 10

    @Option(name: .long, help: "Classifier-free guidance scale (default 2.0).")
    var cfg: Float = 2.0

    @Option(name: .long, help: "Seconds of silence prepended to avoid a clipped start (default 0.1).")
    var prepad: Double = 0.1

    /// Rejects argument combinations that cannot produce audio.
    ///
    /// Runs as part of argument parsing, so the user gets feedback immediately
    /// instead of after the model has been fetched and loaded.
    ///
    /// - Throws: `ValidationError` describing the first offending argument.
    func validate() throws {
        guard !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
            throw ValidationError("The text to speak must not be empty.")
        }
        _ = try existingReferenceURL()
        guard timesteps > 0 else {
            throw ValidationError("--timesteps must be at least 1 (got \(timesteps)).")
        }
        guard cfg.isFinite else {
            throw ValidationError("--cfg must be a finite number.")
        }
        guard prepad.isFinite else {
            throw ValidationError("--prepad must be a finite number of seconds.")
        }
    }

    /// Loads the model, synthesizes `text`, then either writes a WAV (`-o`) or plays the audio.
    ///
    /// - Throws: `ValidationError` when the reference file is missing, no audio was
    ///   generated, or the padding length is unusable; `ExitCode.failure` when
    ///   playback fails; and whatever the model, audio loader or WAV writer throw.
    func run() async throws {
        // Re-check the reference before the (potentially slow) model load so a bad
        // path never costs a download, even if `validate()` was bypassed.
        let referenceURL = try existingReferenceURL()

        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
        log("Loading \(modelId) ...")
        let model = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
            log(" [\(Int(progress * 100))%] \(status)")
        }
        defer { model.unload() }

        var refAudio: [Float]? = nil
        if let referenceURL {
            // VoxCPM2 ingests reference audio at 16 kHz.
            refAudio = try AudioFileLoader.load(url: referenceURL, targetSampleRate: 16000)
            log(" Reference: \(refAudio?.count ?? 0) samples")
        }

        log("Synthesizing ...")
        let audio = try await model.generateVoxCPM2(
            text: text,
            language: language,
            refAudio: refAudio,
            inferenceTimesteps: timesteps,
            cfgValue: cfg
        )
        guard !audio.isEmpty else {
            throw ValidationError("No audio was generated.")
        }
        let rate = model.sampleRate

        // Prepend leading silence so the first phoneme isn't clipped on playback.
        let padSamples = try Self.paddingSampleCount(seconds: prepad, sampleRate: rate)
        let out = padSamples > 0 ? [Float](repeating: 0, count: padSamples) + audio : audio

        if let output {
            let url = URL(fileURLWithPath: output)
            try WAVWriter.write(samples: out, sampleRate: rate, to: url)
            log("Saved \(out.count) samples (\(seconds(out.count, rate))s) to \(output)")
        } else {
            try playThroughSpeakers(samples: out, sampleRate: rate)
        }
    }

    // MARK: - Helpers

    /// Returns the file URL for `--reference`, or `nil` when no reference was given.
    ///
    /// - Throws: `ValidationError` when a reference was given but no file exists at that path.
    private func existingReferenceURL() throws -> URL? {
        guard let reference else { return nil }
        let url = URL(fileURLWithPath: reference)
        guard FileManager.default.fileExists(atPath: url.path) else {
            throw ValidationError("Reference file not found: \(reference)")
        }
        return url
    }

    /// Converts a padding duration into a whole number of samples, truncating toward zero.
    ///
    /// Negative durations yield `0`. The conversion goes through `Int(exactly:)`
    /// because a plain `Int(_:)` traps on NaN, infinity and out-of-range values.
    ///
    /// - Throws: `ValidationError` when the duration cannot be expressed as a sample count.
    private static func paddingSampleCount(seconds: Double, sampleRate: Int) throws -> Int {
        let frames = (max(0, seconds) * Double(sampleRate)).rounded(.towardZero)
        guard let count = Int(exactly: frames) else {
            throw ValidationError("--prepad \(seconds) is out of range.")
        }
        return max(0, count)
    }

    /// Writes `samples` to a temporary WAV and plays it with `/usr/bin/afplay`, blocking until done.
    ///
    /// - Throws: errors from writing the file or launching `afplay`, and
    ///   `ExitCode.failure` when `afplay` does not exit cleanly.
    private func playThroughSpeakers(samples: [Float], sampleRate: Int) throws {
        let tmp = FileManager.default.temporaryDirectory
            .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
        // Registered before the write so a partially written file is cleaned up too.
        defer { try? FileManager.default.removeItem(at: tmp) }
        try WAVWriter.write(samples: samples, sampleRate: sampleRate, to: tmp)

        let proc = Process()
        proc.executableURL = URL(fileURLWithPath: "/usr/bin/afplay")
        proc.arguments = [tmp.path]
        try proc.run()
        proc.waitUntilExit()

        // A failed or killed player must not look like a successful run.
        guard proc.terminationReason == .exit, proc.terminationStatus == 0 else {
            log("Playback failed: afplay ended with status \(proc.terminationStatus).")
            throw ExitCode.failure
        }
    }

    /// Formats a sample count as seconds with two decimal places.
    private func seconds(_ count: Int, _ rate: Int) -> String {
        String(format: "%.2f", Double(count) / Double(rate))
    }

    /// Progress/status goes to stderr so stdout stays clean for piping.
    private func log(_ message: String) {
        FileHandle.standardError.write(Data((message + "\n").utf8))
    }
}
|
||||
Reference in New Issue
Block a user