diff --git a/Makefile b/Makefile
index 01c89df..207145f 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,6 @@
 
 SPEECH_SWIFT ?= ../speech-swift
 CONFIG ?= release
-METALLIB = $(SPEECH_SWIFT)/.build/$(CONFIG)/mlx.metallib
 
 .PHONY: build run clean metallib install
 
@@ -14,15 +13,24 @@ build:
 	swift build -c $(CONFIG)
 	@$(MAKE) --no-print-directory metallib
 
-# Copy the prebuilt metallib next to our binary. If speech-swift hasn't built
-# it yet, fall back to its build script.
+# Copy the prebuilt metallib next to our binary. SwiftPM's real bin dir is
+# triple-prefixed (e.g. .build/arm64-apple-macosx/release) and the
+# .build/$(CONFIG) symlink is not always present in speech-swift, so resolve
+# both ends with `swift build --show-bin-path` instead of hardcoding the path.
+# If speech-swift hasn't built the metallib yet, fall back to its build script.
+# The recipe is a single shell invocation, so `set -e` is needed for a failed
+# step (e.g. the cp) to fail the target instead of being masked by the echo.
 metallib:
-	@if [ ! -f "$(METALLIB)" ]; then \
+	@set -e; \
+	ss_bin="$$(cd "$(SPEECH_SWIFT)" && swift build -c $(CONFIG) --show-bin-path)"; \
+	if [ ! -f "$$ss_bin/mlx.metallib" ]; then \
 		echo "metallib missing in speech-swift — building it there..."; \
 		( cd "$(SPEECH_SWIFT)" && swift build -c $(CONFIG) && ./scripts/build_mlx_metallib.sh $(CONFIG) ); \
-	fi
-	@cp "$(METALLIB)" ".build/$(CONFIG)/mlx.metallib"
-	@echo "metallib in place: .build/$(CONFIG)/mlx.metallib"
+		ss_bin="$$(cd "$(SPEECH_SWIFT)" && swift build -c $(CONFIG) --show-bin-path)"; \
+	fi; \
+	our_bin="$$(swift build -c $(CONFIG) --show-bin-path)"; \
+	cp "$$ss_bin/mlx.metallib" "$$our_bin/mlx.metallib"; \
+	echo "metallib in place: $$our_bin/mlx.metallib"
 
 # Quick smoke test (default voice).
 run: build
diff --git a/Package.swift b/Package.swift
index aa9be96..9bdf605 100644
--- a/Package.swift
+++ b/Package.swift
@@ -7,6 +7,12 @@ let package = Package(
     dependencies: [
         .package(path: "../speech-swift"),
         .package(url: "https://github.com/apple/swift-argument-parser", from: "1.5.0"),
+        // MLX core — used by `gpu-check` to query the Metal device and run a
+        // GPU compute probe. Same package speech-swift resolves (pinned 0.31.x).
+        .package(url: "https://github.com/ml-explore/mlx-swift", from: "0.30.0"),
+        // Hummingbird — loopback HTTP server for the `serve` resident TTS daemon.
+        // Same range speech-swift pins; HTTP only, no WebSocket.
+        .package(url: "https://github.com/hummingbird-project/hummingbird.git", "2.5.0"..<"2.17.0"),
     ],
     targets: [
         .executableTarget(
@@ -15,6 +21,8 @@ let package = Package(
                 .product(name: "VoxCPM2TTS", package: "speech-swift"),
                 .product(name: "AudioCommon", package: "speech-swift"),
                 .product(name: "ArgumentParser", package: "swift-argument-parser"),
+                .product(name: "MLX", package: "mlx-swift"),
+                .product(name: "Hummingbird", package: "hummingbird"),
             ],
             path: "Sources",
             swiftSettings: [.swiftLanguageMode(.v5)]
diff --git a/Sources/GpuCheck.swift b/Sources/GpuCheck.swift
new file mode 100644
index 0000000..3cf84e1
--- /dev/null
+++ b/Sources/GpuCheck.swift
@@ -0,0 +1,177 @@
+import ArgumentParser
+import Dispatch
+import Foundation
+import MLX
+
+/// `madcat-say gpu-check` — confirm that MLX inference will run on the Apple
+/// Silicon Metal GPU rather than silently falling back to CPU, and that the
+/// compiled Metal shader library (`mlx.metallib`) is in place next to the
+/// binary.
+///
+/// What it checks:
+/// 1. `GPU.deviceInfo()` reports an Apple GPU architecture (Metal present).
+/// 2. `mlx.metallib` exists next to the executable. Missing => shaders are
+///    JIT-compiled (~5x slower), or inference aborts with
+///    "Failed to load the default metallib".
+/// 3. A real matmul is forced to completion with `eval()`; the MLX
+///    active-memory delta proves the work was allocated on the Metal GPU.
+///
+/// Exit code: 0 = PASS (Apple Metal GPU is the compute device), 2 = FAIL.
+struct GpuCheck: AsyncParsableCommand {
+    static let configuration = CommandConfiguration(
+        commandName: "gpu-check",
+        abstract: "Confirm MLX is using the Apple Silicon Metal GPU (not CPU)."
+    )
+
+    @Flag(name: .long, help: "Emit machine-readable JSON instead of text.")
+    var json = false
+
+    @Option(name: .long, help: "Edge length N of the NxN matmul probe (default 1024).")
+    var probe: Int = 1024
+
+    func run() async throws {
+        // 1. Metal device identity. Only a Metal device answers deviceInfo();
+        // on Apple Silicon the architecture string contains "apple".
+        let info = GPU.deviceInfo()
+        let architecture = info.architecture
+        let isAppleGPU = architecture.lowercased().contains("apple")
+        let totalMemory = info.memorySize
+        let maxWorkingSet = Int(info.maxRecommendedWorkingSetSize)
+
+        // 2. metallib presence (project convention: copied next to the binary).
+        let (metallibPath, metallibFound) = Self.metallibStatus()
+
+        // 3. Live GPU compute probe: matmul -> eval(). The cold eval pays the
+        // Metal pipeline / metallib load; the warm eval is steady state.
+        let n = max(8, probe)
+        let activeBefore = Memory.activeMemory
+
+        let a = ones([n, n])
+        let b = ones([n, n])
+
+        let coldStart = DispatchTime.now()
+        var c = matmul(a, b)
+        eval(c)
+        let coldMs = Self.elapsedMs(since: coldStart)
+
+        let warmStart = DispatchTime.now()
+        c = matmul(c, b)
+        eval(c)
+        let warmMs = Self.elapsedMs(since: warmStart)
+
+        let checksum = c.sum().item(Float.self)
+        let memoryDelta = Memory.activeMemory - activeBefore
+        let pass = isAppleGPU
+
+        let report = Report(
+            appleGPU: isAppleGPU,
+            architecture: architecture,
+            totalMemory: totalMemory,
+            maxWorkingSet: maxWorkingSet,
+            metallibPath: metallibPath,
+            metallibFound: metallibFound,
+            probe: n,
+            coldMs: coldMs,
+            warmMs: warmMs,
+            checksum: checksum,
+            memoryDelta: memoryDelta,
+            pass: pass
+        )
+
+        print(json ? report.json : report.text)
+
+        if !pass {
+            throw ExitCode(2)
+        }
+    }
+
+    // MARK: - Helpers
+
+    /// MLX expects `mlx.metallib` next to the executable (see the Makefile,
+    /// which copies it out of speech-swift's build dir).
+    private static func metallibStatus() -> (path: String, found: Bool) {
+        let exe = Bundle.main.executableURL
+            ?? URL(fileURLWithPath: CommandLine.arguments.first ?? "madcat-say")
+        let dir = exe.resolvingSymlinksInPath().deletingLastPathComponent()
+        let candidate = dir.appendingPathComponent("mlx.metallib")
+        return (candidate.path, FileManager.default.fileExists(atPath: candidate.path))
+    }
+
+    private static func elapsedMs(since start: DispatchTime) -> Double {
+        Double(DispatchTime.now().uptimeNanoseconds &- start.uptimeNanoseconds) / 1_000_000
+    }
+
+    static func human(_ bytes: Int) -> String {
+        guard bytes > 0 else { return "0 B" }
+        let units = ["B", "KB", "MB", "GB", "TB"]
+        var value = Double(bytes)
+        var i = 0
+        while value >= 1024 && i < units.count - 1 {
+            value /= 1024
+            i += 1
+        }
+        return String(format: "%.1f %@", value, units[i])
+    }
+}
+
+/// Result of a `gpu-check` run, with text + JSON renderings.
+private struct Report {
+    let appleGPU: Bool
+    let architecture: String
+    let totalMemory: Int
+    let maxWorkingSet: Int
+    let metallibPath: String
+    let metallibFound: Bool
+    let probe: Int
+    let coldMs: Double
+    let warmMs: Double
+    let checksum: Float
+    let memoryDelta: Int
+    let pass: Bool
+
+    var text: String {
+        var lines: [String] = []
+        lines.append("madcat-say gpu-check")
+        lines.append("--------------------")
+        lines.append("Metal GPU : \(architecture) (Apple Silicon: \(appleGPU ? "YES" : "NO"))")
+        lines.append("Total memory : \(GpuCheck.human(totalMemory))")
+        lines.append("Max working set : \(GpuCheck.human(maxWorkingSet))")
+        lines.append("MLX metallib : \(metallibFound ? "found" : "MISSING") (\(metallibPath))")
+        if !metallibFound {
+            lines.append(" WARNING: shaders will JIT-compile (~5x slower) or fail to load.")
+            lines.append(" Fix: run `make` (copies speech-swift's mlx.metallib next to the binary).")
+        }
+        lines.append("Compute probe : \(probe)x\(probe) matmul on GPU")
+        lines.append(String(format: " cold eval : %.2f ms (Metal pipeline / metallib load)", coldMs))
+        lines.append(String(format: " warm eval : %.2f ms", warmMs))
+        lines.append(" GPU mem delta : \(GpuCheck.human(max(0, memoryDelta))) active")
+        lines.append(String(format: " checksum : %.0f", checksum))
+        lines.append("VERDICT: " + (pass
+            ? "PASS - MLX inference will run on the Metal GPU."
+            : "FAIL - no Apple Metal GPU detected; inference would fall back to CPU."))
+        return lines.joined(separator: "\n")
+    }
+
+    var json: String {
+        let payload: [String: Any] = [
+            "apple_gpu": appleGPU,
+            "architecture": architecture,
+            "total_memory_bytes": totalMemory,
+            "max_recommended_working_set_bytes": maxWorkingSet,
+            "metallib_path": metallibPath,
+            "metallib_found": metallibFound,
+            "probe_dim": probe,
+            "cold_eval_ms": coldMs,
+            "warm_eval_ms": warmMs,
+            "active_memory_delta_bytes": memoryDelta,
+            "checksum": Double(checksum),
+            "verdict": pass ? "PASS" : "FAIL"
+        ]
+        guard let data = try? JSONSerialization.data(
+            withJSONObject: payload, options: [.prettyPrinted, .sortedKeys]
+        ) else {
+            return "{\"verdict\":\"\(pass ? "PASS" : "FAIL")\"}"
+        }
+        return String(decoding: data, as: UTF8.self)
+    }
+}
diff --git a/Sources/MadcatSay.swift b/Sources/MadcatSay.swift
index 4213226..0c35d7d 100644
--- a/Sources/MadcatSay.swift
+++ b/Sources/MadcatSay.swift
@@ -3,16 +3,30 @@ import AudioCommon
 import Foundation
 @preconcurrency import VoxCPM2TTS
 
-/// madcat-say — speak text in a (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
+/// madcat-say — on-device voice-cloning TTS using VoxCPM2 (MLX/Metal).
+///
+/// Root command. `speak` is the default subcommand, so the historical
+/// `madcat-say "text"` form keeps working unchanged; `madcat-say gpu-check`
+/// runs the Metal/GPU diagnostic.
+@main
+struct MadcatSay: AsyncParsableCommand {
+    static let configuration = CommandConfiguration(
+        commandName: "madcat-say",
+        abstract: "On-device voice cloning with VoxCPM2 (MLX/Metal).",
+        subcommands: [Speak.self, GpuCheck.self, Serve.self],
+        defaultSubcommand: Speak.self
+    )
+}
+
+/// Speak text in a (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
 ///
 /// madcat-say "Hello there"
 /// madcat-say -r samantha.wav "I was calibrated just for you."
 /// madcat-say -r samantha.wav -l polish "Cześć, jestem Samantha."
 /// madcat-say -r samantha.wav -o out.wav "Saved instead of played."
-@main
-struct MadcatSay: AsyncParsableCommand {
+struct Speak: AsyncParsableCommand {
     static let configuration = CommandConfiguration(
-        commandName: "madcat-say",
+        commandName: "speak",
         abstract: "Speak text in a cloned voice using VoxCPM2 (on-device, MLX/Metal).",
         discussion: """
         With no -r, uses the model's default voice. With -r it zero-shot clones
@@ -46,7 +60,19 @@ struct MadcatSay: AsyncParsableCommand {
     @Option(name: .long, help: "Seconds of silence prepended to avoid a clipped start (default 0.1).")
     var prepad: Double = 0.1
 
+    @Option(name: .long, help: "Resident daemon port to try first (default 8765).")
+    var daemonPort: Int = 8765
+
+    @Flag(name: .long, help: "Bypass the resident daemon; always load the model in-process.")
+    var noDaemon: Bool = false
+
     func run() async throws {
+        // Fast path: forward to the resident `serve` daemon if it is up, so the
+        // model stays warm in GPU memory instead of being cold-loaded per call.
+        if !noDaemon, await trySpeakViaDaemon() {
+            return
+        }
+
         let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
         log("Loading \(modelId) ...")
         let model = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
@@ -93,6 +119,74 @@ struct MadcatSay: AsyncParsableCommand {
 
     // MARK: - Helpers
 
+    /// Try to satisfy this request via the resident `serve` daemon. Returns true
+    /// if the daemon handled it (audio played or saved); false on any failure so
+    /// the caller falls back to an in-process model load. Never throws — a down
+    /// or broken daemon must degrade gracefully, not abort the command.
+    private func trySpeakViaDaemon() async -> Bool {
+        let base = "http://127.0.0.1:\(daemonPort)"
+        let session = URLSession(configuration: .ephemeral)
+
+        // Health probe with a short timeout so a missing daemon costs ~nothing.
+        guard let healthURL = URL(string: base + "/health") else { return false }
+        var healthReq = URLRequest(url: healthURL)
+        healthReq.timeoutInterval = 0.6
+        do {
+            let (_, resp) = try await session.data(for: healthReq)
+            guard let http = resp as? HTTPURLResponse, http.statusCode == 200 else { return false }
+        } catch {
+            return false
+        }
+
+        // Build the synthesis request.
+        var payload: [String: Any] = ["input": text, "timesteps": timesteps, "cfg": Double(cfg), "prepad": prepad]
+        if let language { payload["language"] = language }
+        if let reference {
+            // The daemon resolves relative paths against its own working
+            // directory, so send an absolute path resolved against ours.
+            let expanded = (reference as NSString).expandingTildeInPath
+            payload["voice"] = URL(fileURLWithPath: expanded).standardizedFileURL.path
+        }
+
+        guard let speakURL = URL(string: base + "/v1/audio/speech"),
+              let bodyData = try? JSONSerialization.data(withJSONObject: payload) else {
+            return false
+        }
+        var req = URLRequest(url: speakURL)
+        req.httpMethod = "POST"
+        req.setValue("application/json", forHTTPHeaderField: "Content-Type")
+        req.httpBody = bodyData
+        req.timeoutInterval = 300
+
+        log("Using resident daemon on port \(daemonPort) ...")
+        do {
+            let (wav, resp) = try await session.data(for: req)
+            guard let http = resp as? HTTPURLResponse, http.statusCode == 200, !wav.isEmpty else {
+                log("Daemon returned no audio; falling back to in-process load.")
+                return false
+            }
+            if let output {
+                try wav.write(to: URL(fileURLWithPath: output))
+                log("Saved \(wav.count) bytes to \(output)")
+            } else {
+                let tmp = FileManager.default.temporaryDirectory
+                    .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
+                // Register cleanup first so a failed write leaves no temp file behind.
+                defer { try? FileManager.default.removeItem(at: tmp) }
+                try wav.write(to: tmp)
+                let proc = Process()
+                proc.executableURL = URL(fileURLWithPath: "/usr/bin/afplay")
+                proc.arguments = [tmp.path]
+                try proc.run()
+                proc.waitUntilExit()
+            }
+            return true
+        } catch {
+            log("Daemon request failed (\(error)); falling back to in-process load.")
+            return false
+        }
+    }
+
     private func playThroughSpeakers(samples: [Float], sampleRate: Int) throws {
         let tmp = FileManager.default.temporaryDirectory
             .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
diff --git a/Sources/Serve.swift b/Sources/Serve.swift
new file mode 100644
index 0000000..fa2516c
--- /dev/null
+++ b/Sources/Serve.swift
@@ -0,0 +1,244 @@
+import ArgumentParser
+import AudioCommon
+import Foundation
+import Hummingbird
+import NIOCore
+@preconcurrency import VoxCPM2TTS
+
+/// `madcat-say serve` — resident VoxCPM2 TTS daemon.
+///
+/// Loads the model once, warms the Metal pipeline, and keeps it resident in GPU
+/// memory, serving synthesis over a loopback HTTP API. This pays the multi-GB
+/// cold model load exactly once (at startup) instead of on every `madcat-say`
+/// invocation. The `speak` subcommand probes this daemon's `/health` and
+/// forwards to it when up, falling back to an in-process load when it is not.
+struct Serve: AsyncParsableCommand {
+    static let configuration = CommandConfiguration(
+        commandName: "serve",
+        abstract: "Run the resident VoxCPM2 TTS daemon (load once, stay warm)."
+    )
+
+    @Option(name: .long, help: "Host to bind (default: 127.0.0.1).")
+    var host: String = "127.0.0.1"
+
+    @Option(name: .long, help: "Port to bind (default: 8765).")
+    var port: Int = 8765
+
+    @Option(name: .long, help: "Model variant: int4 (default), int8, bf16.")
+    var variant: String = "int4"
+
+    @Option(name: .long, help: "Default diffusion timesteps per patch (default 10).")
+    var timesteps: Int = 10
+
+    @Option(name: .long, help: "Default classifier-free guidance scale (default 2.0).")
+    var cfg: Float = 2.0
+
+    @Flag(name: .long, help: "Skip the warmup synthesis at startup.")
+    var noWarmup: Bool = false
+
+    func run() async throws {
+        let engine = SynthEngine(variant: variant, defaultTimesteps: timesteps, defaultCfg: cfg)
+        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
+
+        log("Loading \(modelId) ...")
+        try await engine.load()
+        if !noWarmup {
+            log("Warming up GPU pipeline ...")
+            try await engine.warmup()
+        }
+        log("Model resident. Serving on http://\(host):\(port)")
+        log(" GET /health")
+        log(" POST /v1/audio/speech {input|text, voice?, language?, timesteps?, cfg?, prepad?}")
+
+        let started = Date()
+        let router = Router()
+
+        router.get("/health") { _, _ -> Response in
+            let ready = await engine.isReady
+            let uptime = Int(Date().timeIntervalSince(started))
+            let json = "{\"status\":\"\(ready ? "ok" : "loading")\",\"model\":\"\(modelId)\",\"ready\":\(ready),\"uptime_s\":\(uptime)}"
+            return Response(
+                status: ready ? .ok : .serviceUnavailable,
+                headers: [.contentType: "application/json"],
+                body: .init(byteBuffer: .init(string: json)))
+        }
+
+        router.post("/v1/audio/speech") { request, _ -> Response in
+            let body = try await request.body.collect(upTo: 1 << 20)
+            let req = SpeechRequest.parse(body)
+            guard let text = req.text, !text.isEmpty else {
+                return Self.errorResponse("Missing 'input' (or 'text') field.", status: .badRequest)
+            }
+            do {
+                let result = try await engine.synthesize(
+                    text: text,
+                    language: req.language,
+                    referencePath: req.voice,
+                    timesteps: req.timesteps,
+                    cfg: req.cfg,
+                    prepad: req.prepad ?? 0.1)
+                let wav = try Self.encodeWAV(samples: result.samples, sampleRate: result.sampleRate)
+                return Response(
+                    status: .ok,
+                    headers: [.contentType: "audio/wav"],
+                    body: .init(byteBuffer: .init(data: wav)))
+            } catch {
+                return Self.errorResponse("Synthesis failed: \(error)", status: .internalServerError)
+            }
+        }
+
+        let app = Application(
+            router: router,
+            configuration: .init(address: .hostname(host, port: port)))
+        try await app.run()
+    }
+
+    // MARK: - Helpers
+
+    /// Encodes `samples` as WAV bytes by round-tripping through a temporary file.
+    static func encodeWAV(samples: [Float], sampleRate: Int) throws -> Data {
+        let tmp = FileManager.default.temporaryDirectory
+            .appendingPathComponent("madcat-say-serve-\(UUID().uuidString).wav")
+        // Register cleanup before writing so a failed/partial write is removed too.
+        defer { try? FileManager.default.removeItem(at: tmp) }
+        try WAVWriter.write(samples: samples, sampleRate: sampleRate, to: tmp)
+        return try Data(contentsOf: tmp)
+    }
+
+    static func errorResponse(_ message: String, status: HTTPResponse.Status) -> Response {
+        let data = (try? JSONSerialization.data(withJSONObject: ["error": message])) ?? Data()
+        return Response(
+            status: status,
+            headers: [.contentType: "application/json"],
+            body: .init(byteBuffer: .init(data: data)))
+    }
+
+    func log(_ message: String) {
+        FileHandle.standardError.write(Data((message + "\n").utf8))
+    }
+}
+
+// MARK: - Request model
+
+/// Parsed `/v1/audio/speech` request. OpenAI-ish: `input` is the text, `voice`
+/// is a reference WAV path (loopback / same host) for zero-shot cloning.
+struct SpeechRequest {
+    var text: String?
+    var voice: String?
+    var language: String?
+    var timesteps: Int?
+    var cfg: Float?
+    var prepad: Double?
+
+    static func parse(_ body: ByteBuffer) -> SpeechRequest {
+        var req = SpeechRequest()
+        let data = Data(buffer: body)
+        guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else {
+            return req
+        }
+        req.text = (json["input"] as? String) ?? (json["text"] as? String)
+        req.voice = json["voice"] as? String
+        req.language = json["language"] as? String
+        req.timesteps = json["timesteps"] as? Int
+        if let c = json["cfg"] as? Double { req.cfg = Float(c) }
+        req.prepad = json["prepad"] as? Double
+        return req
+    }
+}
+
+// MARK: - Resident synthesis engine
+
+/// Holds the loaded model and serializes single-GPU access. Caches the most
+/// recently used reference voice so repeated clone calls skip the re-decode.
+///
+/// NOTE(review): actor methods are reentrant across `await`; if
+/// `generateVoxCPM2` genuinely suspends, overlapping requests may interleave —
+/// confirm, and add an explicit queue if strict serialization is required.
+actor SynthEngine {
+    private var model: VoxCPM2TTSModel?
+    private let modelId: String
+    private let defaultTimesteps: Int
+    private let defaultCfg: Float
+    private var cachedReferencePath: String?
+    private var cachedReference: [Float]?
+    /// Upper bound on request-supplied leading silence, in seconds.
+    private static let maxPrepadSeconds: Double = 60
+
+    init(variant: String, defaultTimesteps: Int, defaultCfg: Float) {
+        self.modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
+        self.defaultTimesteps = defaultTimesteps
+        self.defaultCfg = defaultCfg
+    }
+
+    var isReady: Bool { model != nil }
+
+    func load() async throws {
+        if model != nil { return }
+        let m = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
+            FileHandle.standardError.write(Data(" [\(Int(progress * 100))%] \(status)\n".utf8))
+        }
+        model = m
+    }
+
+    func warmup() async throws {
+        guard let m = model else { return }
+        _ = try await m.generateVoxCPM2(
+            text: "Warming up.",
+            language: nil,
+            refAudio: nil,
+            inferenceTimesteps: defaultTimesteps,
+            cfgValue: defaultCfg)
+    }
+
+    struct Result { let samples: [Float]; let sampleRate: Int }
+
+    /// Synthesizes `text`, optionally cloning the voice in `referencePath`, and
+    /// returns the samples with `prepad` seconds of leading silence.
+    func synthesize(
+        text: String,
+        language: String?,
+        referencePath: String?,
+        timesteps: Int?,
+        cfg: Float?,
+        prepad: Double
+    ) async throws -> Result {
+        if model == nil { try await load() }
+        guard let m = model else {
+            throw ValidationError("Model failed to load.")
+        }
+
+        var refAudio: [Float]?
+        if let path = referencePath, !path.isEmpty {
+            if path == cachedReferencePath, let cached = cachedReference {
+                refAudio = cached
+            } else {
+                let url = URL(fileURLWithPath: (path as NSString).expandingTildeInPath)
+                guard FileManager.default.fileExists(atPath: url.path) else {
+                    throw ValidationError("Reference file not found: \(path)")
+                }
+                let loaded = try AudioFileLoader.load(url: url, targetSampleRate: 16000)
+                cachedReferencePath = path
+                cachedReference = loaded
+                refAudio = loaded
+            }
+        }
+
+        let audio = try await m.generateVoxCPM2(
+            text: text,
+            language: language,
+            refAudio: refAudio,
+            inferenceTimesteps: timesteps ?? defaultTimesteps,
+            cfgValue: cfg ?? defaultCfg)
+        guard !audio.isEmpty else {
+            throw ValidationError("No audio was generated.")
+        }
+
+        let rate = m.sampleRate
+        // `prepad` arrives from request JSON; `Int(_:)` traps on non-finite or
+        // out-of-range doubles, so clamp to [0, maxPrepadSeconds] first.
+        let padSeconds = prepad.isFinite ? min(max(prepad, 0), Self.maxPrepadSeconds) : 0
+        let padSamples = Int(padSeconds * Double(rate))
+        let out = padSamples > 0 ? [Float](repeating: 0, count: padSamples) + audio : audio
+        return Result(samples: out, sampleRate: rate)
+    }
+}