commit 67be877f7f3c943f5229e1857c71c9251dcb6722
Author: marauder-actual
Date:   Sun May 31 14:34:59 2026 +0200

    initial commit

    On-device voice cloning CLI using VoxCPM2 via MLX on macOS.
    Swift 6, macOS 15+, depends on speech-swift (local path dep).

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b6befdd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.build/
+.swiftpm/
+*.wav
+*.metallib
+.DS_Store
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..01c89df
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,42 @@
+# madcat-say — VoxCPM2 voice-cloning CLI (on-device, MLX/Metal)
+#
+# MLX needs a compiled Metal shader library (mlx.metallib) sitting next to the
+# binary, or inference crashes with "Failed to load the default metallib".
+# We reuse the one speech-swift already builds.
+
+SPEECH_SWIFT ?= ../speech-swift
+CONFIG       ?= release
+METALLIB      = $(SPEECH_SWIFT)/.build/$(CONFIG)/mlx.metallib
+
+.PHONY: build run clean metallib install
+
+build:
+	swift build -c $(CONFIG)
+	@$(MAKE) --no-print-directory metallib
+
+# Copy the prebuilt metallib next to our binary. If speech-swift hasn't built
+# it yet, fall back to its build script. The destination directory is created
+# first so `make metallib` also works before anything has been built here.
+metallib:
+	@if [ ! -f "$(METALLIB)" ]; then \
+		echo "metallib missing in speech-swift — building it there..."; \
+		( cd "$(SPEECH_SWIFT)" && swift build -c $(CONFIG) && ./scripts/build_mlx_metallib.sh $(CONFIG) ); \
+	fi
+	@mkdir -p ".build/$(CONFIG)"
+	@cp "$(METALLIB)" ".build/$(CONFIG)/mlx.metallib"
+	@echo "metallib in place: .build/$(CONFIG)/mlx.metallib"
+
+# Quick smoke test (default voice).
+run: build
+	.build/$(CONFIG)/madcat-say "Hello. This is madcat say, running on device."
+
+# Install to ~/.local/bin (binary + metallib alongside it).
+install: build
+	@mkdir -p "$(HOME)/.local/bin"
+	cp ".build/$(CONFIG)/madcat-say" "$(HOME)/.local/bin/madcat-say"
+	cp ".build/$(CONFIG)/mlx.metallib" "$(HOME)/.local/bin/mlx.metallib"
+	@echo "installed to ~/.local/bin/madcat-say (with mlx.metallib)"
+
+clean:
+	swift package clean
+	rm -rf .build
diff --git a/Package.resolved b/Package.resolved
new file mode 100644
index 0000000..7a58023
--- /dev/null
+++ b/Package.resolved
@@ -0,0 +1,312 @@
+{
+  "originHash" : "2e9e6347e6907ec0b7d6e2626c125e2264b2d5c17d1e099cb4b69ec870ceab21",
+  "pins" : [
+    {
+      "identity" : "async-http-client",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/swift-server/async-http-client.git",
+      "state" : {
+        "revision" : "3a5b74a58782c3b4c1f0bc75e9b67b10c2494e8f",
+        "version" : "1.33.1"
+      }
+    },
+    {
+      "identity" : "compress-nio",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/adam-fowler/compress-nio.git",
+      "state" : {
+        "revision" : "e1caa19077dda4b00441142ef57da3db02acd466",
+        "version" : "1.4.2"
+      }
+    },
+    {
+      "identity" : "eventsource",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/mattt/EventSource.git",
+      "state" : {
+        "revision" : "a3a85a85214caf642abaa96ae664e4c772a59f6e",
+        "version" : "1.4.1"
+      }
+    },
+    {
+      "identity" : "hummingbird",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/hummingbird-project/hummingbird.git",
+      "state" : {
+        "revision" : "3ae359b1bb1e72378ed43b59fdcd4d44cac5d7a4",
+        "version" : "2.16.0"
+      }
+    },
+    {
+      "identity" : "hummingbird-websocket",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/hummingbird-project/hummingbird-websocket.git",
+      "state" : {
+        "revision" : "716c54294152c6d3301a6239a1d74db57cbcd6dc",
+        "version" : "2.6.0"
+      }
+    },
+    {
+      "identity" : "mlx-swift",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/ml-explore/mlx-swift",
+      "state" : {
+        "revision" : "61b9e011e09a62b489f6bd647958f1555bdf2896",
+        "version" : "0.31.3"
+      }
+    },
+    {
+      "identity" : "swift-algorithms",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-algorithms.git",
+      "state" : {
+        "revision" : "87e50f483c54e6efd60e885f7f5aa946cee68023",
+        "version" : "1.2.1"
+      }
+    },
+    {
+      "identity" : "swift-argument-parser",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-argument-parser",
+      "state" : {
+        "revision" : "ca37474853a4b5f59a22c74bfdd449b1f6bc4cc2",
+        "version" : "1.8.1"
+      }
+    },
+    {
+      "identity" : "swift-asn1",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-asn1.git",
+      "state" : {
+        "revision" : "eb50cbd14606a9161cbc5d452f18797c90ef0bab",
+        "version" : "1.7.0"
+      }
+    },
+    {
+      "identity" : "swift-async-algorithms",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-async-algorithms.git",
+      "state" : {
+        "revision" : "d0b4a06d0f173a2f3be27d3ea21b3c3aa18db440",
+        "version" : "1.1.4"
+      }
+    },
+    {
+      "identity" : "swift-atomics",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-atomics.git",
+      "state" : {
+        "revision" : "b601256eab081c0f92f059e12818ac1d4f178ff7",
+        "version" : "1.3.0"
+      }
+    },
+    {
+      "identity" : "swift-certificates",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-certificates.git",
+      "state" : {
+        "revision" : "bde8ca32a096825dfce37467137c903418c1893d",
+        "version" : "1.19.1"
+      }
+    },
+    {
+      "identity" : "swift-collections",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-collections.git",
+      "state" : {
+        "revision" : "fea17c02d767f46b23070fdfdacc28a03a39232a",
+        "version" : "1.5.1"
+      }
+    },
+    {
+      "identity" : "swift-configuration",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-configuration.git",
+      "state" : {
+        "revision" : "be76c4ad929eb6c4bcaf3351799f2adf9e6848a9",
+        "version" : "1.2.0"
+      }
+    },
+    {
+      "identity" : "swift-crypto",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-crypto.git",
+      "state" : {
+        "revision" : "1b6b2e274e85105bfa155183145a1dcfd63331f1",
+        "version" : "4.5.0"
+      }
+    },
+    {
+      "identity" : "swift-distributed-tracing",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-distributed-tracing.git",
+      "state" : {
+        "revision" : "dc4030184203ffafbb2ec614352487235d747fe0",
+        "version" : "1.4.1"
+      }
+    },
+    {
+      "identity" : "swift-http-structured-headers",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-http-structured-headers.git",
+      "state" : {
+        "revision" : "933538faa42c432d385f02e07df0ace7c5ecfc47",
+        "version" : "1.7.0"
+      }
+    },
+    {
+      "identity" : "swift-http-types",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-http-types.git",
+      "state" : {
+        "revision" : "45eb0224913ea070ec4fba17291b9e7ecf4749ca",
+        "version" : "1.5.1"
+      }
+    },
+    {
+      "identity" : "swift-huggingface",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/huggingface/swift-huggingface.git",
+      "state" : {
+        "revision" : "b721959445b617d0bf03910b2b4aced345fd93bf",
+        "version" : "0.9.0"
+      }
+    },
+    {
+      "identity" : "swift-jinja",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/huggingface/swift-jinja.git",
+      "state" : {
+        "revision" : "0b67ecb79139f6addef8699eff3622808aa6c7dc",
+        "version" : "2.3.6"
+      }
+    },
+    {
+      "identity" : "swift-log",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-log.git",
+      "state" : {
+        "revision" : "2aed77ae5ec9a86d8fe42c12275e4c2653a286ee",
+        "version" : "1.13.1"
+      }
+    },
+    {
+      "identity" : "swift-metrics",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-metrics.git",
+      "state" : {
+        "revision" : "087e8074afa97040c3b870c8664fe5482fb87cc4",
+        "version" : "2.11.0"
+      }
+    },
+    {
+      "identity" : "swift-nio",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-nio.git",
+      "state" : {
+        "revision" : "57c0a08a331aaea9f5d7a932ad94ef43be942a95",
+        "version" : "2.100.0"
+      }
+    },
+    {
+      "identity" : "swift-nio-extras",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-nio-extras.git",
+      "state" : {
+        "revision" : "d2eeec0339074034f11a040a74aa2a341a2c4506",
+        "version" : "1.34.1"
+      }
+    },
+    {
+      "identity" : "swift-nio-http2",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-nio-http2.git",
+      "state" : {
+        "revision" : "61d1b44f6e4e118792be1cff88ee2bc0267c6f9a",
+        "version" : "1.44.0"
+      }
+    },
+    {
+      "identity" : "swift-nio-ssl",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-nio-ssl.git",
+      "state" : {
+        "revision" : "3f337058ccd7243c4cac7911477d8ad4c598d4da",
+        "version" : "2.37.0"
+      }
+    },
+    {
+      "identity" : "swift-nio-transport-services",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-nio-transport-services.git",
+      "state" : {
+        "revision" : "67787bb645a5e67d2edcdfbe48a216cc549222d5",
+        "version" : "1.28.0"
+      }
+    },
+    {
+      "identity" : "swift-numerics",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-numerics.git",
+      "state" : {
+        "revision" : "0c0290ff6b24942dadb83a929ffaaa1481df04a2",
+        "version" : "1.1.1"
+      }
+    },
+    {
+      "identity" : "swift-service-context",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-service-context.git",
+      "state" : {
+        "revision" : "d0997351b0c7779017f88e7a93bc30a1878d7f29",
+        "version" : "1.3.0"
+      }
+    },
+    {
+      "identity" : "swift-service-lifecycle",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/swift-server/swift-service-lifecycle.git",
+      "state" : {
+        "revision" : "9829955b385e5bb88128b73f1b8389e9b9c3191a",
+        "version" : "2.11.0"
+      }
+    },
+    {
+      "identity" : "swift-system",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-system.git",
+      "state" : {
+        "revision" : "7c6ad0fc39d0763e0b699210e4124afd5041c5df",
+        "version" : "1.6.4"
+      }
+    },
+    {
+      "identity" : "swift-transformers",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/huggingface/swift-transformers",
+      "state" : {
+        "revision" : "2fa33e1f5e7131a7fc64c28e6d161dcec0d24820",
+        "version" : "1.3.3"
+      }
+    },
+    {
+      "identity" : "swift-websocket",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/hummingbird-project/swift-websocket.git",
+      "state" : {
+        "revision" : "126df9655565068bd97838c072c1db11f9fd42ee",
+        "version" : "1.6.1"
+      }
+    },
+    {
+      "identity" : "yyjson",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/ibireme/yyjson.git",
+      "state" : {
+        "revision" : "8b4a38dc994a110abaec8a400615567bd996105f",
+        "version" : "0.12.0"
+      }
+    }
+  ],
+  "version" : 3
+}
diff --git a/Package.swift b/Package.swift
new file mode 100644
index 0000000..aa9be96
--- /dev/null
+++ b/Package.swift
@@ -0,0 +1,23 @@
+// swift-tools-version: 6.0
+import PackageDescription
+
+let package = Package(
+    name: "madcat-say",
+    platforms: [.macOS(.v15)],
+    dependencies: [
+        .package(path: "../speech-swift"),
+        .package(url: "https://github.com/apple/swift-argument-parser", from: "1.5.0"),
+    ],
+    targets: [
+        .executableTarget(
+            name: "madcat-say",
+            dependencies: [
+                .product(name: "VoxCPM2TTS", package: "speech-swift"),
+                .product(name: "AudioCommon", package: "speech-swift"),
+                .product(name: "ArgumentParser", package: "swift-argument-parser"),
+            ],
+            path: "Sources",
+            swiftSettings: [.swiftLanguageMode(.v5)]
+        ),
+    ]
+)
diff --git a/Sources/MadcatSay.swift b/Sources/MadcatSay.swift
new file mode 100644
index 0000000..4213226
--- /dev/null
+++ b/Sources/MadcatSay.swift
@@ -0,0 +1,149 @@
+import ArgumentParser
+import AudioCommon
+import Foundation
+@preconcurrency import VoxCPM2TTS
+
+/// madcat-say — speak text in an (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
+///
+///     madcat-say "Hello there"
+///     madcat-say -r samantha.wav "I was calibrated just for you."
+///     madcat-say -r samantha.wav -l polish "Cześć, jestem Samantha."
+///     madcat-say -r samantha.wav -o out.wav "Saved instead of played."
+@main
+struct MadcatSay: AsyncParsableCommand {
+    static let configuration = CommandConfiguration(
+        commandName: "madcat-say",
+        abstract: "Speak text in a cloned voice using VoxCPM2 (on-device, MLX/Metal).",
+        discussion: """
+            With no -r, uses the model's default voice. With -r it zero-shot clones
+            the reference voice. Language is auto-detected from the text; -l is an
+            optional hint. By default the audio is played through the speakers; pass
+            -o to write a 48 kHz WAV instead.
+            """
+    )
+
+    /// Largest accepted `--prepad`, in seconds; keeps the sample-count conversion in range.
+    private static let maxPrepadSeconds: Double = 60
+
+    @Argument(help: "The text to speak.")
+    var text: String
+
+    @Option(name: [.customShort("r"), .long], help: "Reference voice WAV to clone (any sample rate).")
+    var reference: String?
+
+    @Option(name: [.customShort("l"), .long], help: "Language hint (auto-detected if omitted).")
+    var language: String?
+
+    @Option(name: [.customShort("o"), .long], help: "Write WAV here instead of playing it.")
+    var output: String?
+
+    @Option(name: .long, help: "Model variant: int4 (default), int8, bf16.")
+    var variant: String = "int4"
+
+    @Option(name: .long, help: "Diffusion timesteps per patch (default 10; lower = faster, rougher).")
+    var timesteps: Int = 10
+
+    @Option(name: .long, help: "Classifier-free guidance scale (default 2.0).")
+    var cfg: Float = 2.0
+
+    @Option(name: .long, help: "Seconds of silence prepended to avoid a clipped start (default 0.1).")
+    var prepad: Double = 0.1
+
+    /// Rejects option values that would otherwise crash or be meaningless.
+    ///
+    /// ArgumentParser runs this after parsing and before `run()`.
+    /// - Throws: `ValidationError` naming the offending option.
+    func validate() throws {
+        guard timesteps > 0 else {
+            throw ValidationError("--timesteps must be at least 1 (got \(timesteps)).")
+        }
+        // "nan", "inf" and absurdly large values all parse as Double, but would
+        // trap in the Int conversion when the padding length is computed.
+        guard prepad.isFinite, prepad <= Self.maxPrepadSeconds else {
+            throw ValidationError("--prepad must be a finite number of seconds no greater than \(Self.maxPrepadSeconds).")
+        }
+    }
+
+    /// Loads the model, synthesizes `text`, then either writes a WAV (`-o`) or plays it.
+    func run() async throws {
+        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
+        log("Loading \(modelId) ...")
+        let model = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
+            log(" [\(Int(progress * 100))%] \(status)")
+        }
+        defer { model.unload() }
+
+        var refAudio: [Float]? = nil
+        if let reference {
+            let url = URL(fileURLWithPath: reference)
+            guard FileManager.default.fileExists(atPath: url.path) else {
+                throw ValidationError("Reference file not found: \(reference)")
+            }
+            // VoxCPM2 ingests reference audio at 16 kHz.
+            refAudio = try AudioFileLoader.load(url: url, targetSampleRate: 16000)
+            log(" Reference: \(refAudio?.count ?? 0) samples")
+        }
+
+        log("Synthesizing ...")
+        let audio = try await model.generateVoxCPM2(
+            text: text,
+            language: language,
+            refAudio: refAudio,
+            inferenceTimesteps: timesteps,
+            cfgValue: cfg
+        )
+        guard !audio.isEmpty else {
+            throw ValidationError("No audio was generated.")
+        }
+        let rate = model.sampleRate
+
+        // Prepend leading silence so the first phoneme isn't clipped on playback.
+        // Clamp before converting: a large negative value would trap in Int(_:).
+        let padSamples = Int(max(0, prepad) * Double(rate))
+        let out = padSamples > 0 ? [Float](repeating: 0, count: padSamples) + audio : audio
+
+        if let output {
+            let url = URL(fileURLWithPath: output)
+            try WAVWriter.write(samples: out, sampleRate: rate, to: url)
+            log("Saved \(out.count) samples (\(seconds(out.count, rate))s) to \(output)")
+        } else {
+            try playThroughSpeakers(samples: out, sampleRate: rate)
+        }
+    }
+
+    // MARK: - Helpers
+
+    /// Plays `samples` by writing a temporary WAV and handing it to `afplay`.
+    /// - Throws: Any error from writing the file or launching `afplay`, or an
+    ///   `ExitCode` carrying afplay's status when playback fails.
+    private func playThroughSpeakers(samples: [Float], sampleRate: Int) throws {
+        let tmp = FileManager.default.temporaryDirectory
+            .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
+        // Registered before the write so a partially written file is removed too.
+        defer { try? FileManager.default.removeItem(at: tmp) }
+        try WAVWriter.write(samples: samples, sampleRate: sampleRate, to: tmp)
+
+        let proc = Process()
+        proc.executableURL = URL(fileURLWithPath: "/usr/bin/afplay")
+        proc.arguments = [tmp.path]
+        try proc.run()
+        proc.waitUntilExit()
+
+        // afplay reports failure only through its exit status; surface it rather
+        // than exiting 0 after playing nothing.
+        guard proc.terminationStatus == 0 else {
+            log("afplay failed with status \(proc.terminationStatus)")
+            throw ExitCode(proc.terminationStatus)
+        }
+    }
+
+    /// Formats a sample count at `rate` Hz as seconds with two decimals.
+    private func seconds(_ count: Int, _ rate: Int) -> String {
+        String(format: "%.2f", Double(count) / Double(rate))
+    }
+
+    /// Progress/status goes to stderr so stdout stays clean for piping.
+    private func log(_ message: String) {
+        FileHandle.standardError.write(Data((message + "\n").utf8))
+    }
+}