madcat-say/Sources/Serve.swift

import ArgumentParser
import AudioCommon
import Foundation
import Hummingbird
import NIOCore
@preconcurrency import VoxCPM2TTS

/// `madcat-say serve` — resident VoxCPM2 TTS daemon.
///
/// Loads the model once, warms the Metal pipeline, and keeps it resident in GPU
/// memory, serving synthesis over a loopback HTTP API. This pays the multi-GB
/// cold model load exactly once (at startup) instead of on every `madcat-say`
/// invocation. The `speak` subcommand probes this daemon's `/health` and
/// forwards to it when up, falling back to an in-process load when it is not.
struct Serve: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        commandName: "serve",
        abstract: "Run the resident VoxCPM2 TTS daemon (load once, stay warm)."
    )

    /// Address the HTTP listener binds to.
    @Option(name: .long, help: "Host to bind (default: 127.0.0.1).")
    var host: String = "127.0.0.1"

    /// TCP port the HTTP listener binds to.
    @Option(name: .long, help: "Port to bind (default: 8765).")
    var port: Int = 8765

    /// Suffix of the model identifier (`aufklarer/VoxCPM2-MLX-<variant>`).
    @Option(name: .long, help: "Model variant: int4 (default), int8, bf16.")
    var variant: String = "int4"

    /// Timestep count used when a request does not supply `timesteps`.
    @Option(name: .long, help: "Default diffusion timesteps per patch (default 10).")
    var timesteps: Int = 10

    /// Guidance scale used when a request does not supply `cfg`.
    @Option(name: .long, help: "Default classifier-free guidance scale (default 2.0).")
    var cfg: Float = 2.0

    /// When set, the startup warmup synthesis is skipped.
    @Flag(name: .long, help: "Skip the warmup synthesis at startup.")
    var noWarmup: Bool = false

    /// Loads the model, optionally warms it up, registers the HTTP routes,
    /// and runs the server.
    ///
    /// - Throws: Any error from model loading, warmup, or the HTTP
    ///   application itself.
    func run() async throws {
        let engine = SynthEngine(variant: variant, defaultTimesteps: timesteps, defaultCfg: cfg)
        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"

        // The model is loaded (and warmed) before the listener starts, so
        // the server only begins accepting requests once loading succeeded.
        log("Loading \(modelId) ...")
        try await engine.load()
        if !noWarmup {
            log("Warming up GPU pipeline ...")
            try await engine.warmup()
        }
        log("Model resident. Serving on http://\(host):\(port)")
        log("  GET  /health")
        log("  POST /v1/audio/speech   {input|text, voice?, language?, timesteps?, cfg?, prepad?}")

        // Reference point for the `uptime_s` field reported by /health.
        let started = Date()
        let router = Router()

        // Readiness probe: 200 with status "ok" once the engine holds a
        // model, 503 with status "loading" otherwise.
        router.get("/health") { _, _ -> Response in
            let ready = await engine.isReady
            let uptime = Int(Date().timeIntervalSince(started))
            let json = "{\"status\":\"\(ready ? "ok" : "loading")\",\"model\":\"\(modelId)\",\"ready\":\(ready),\"uptime_s\":\(uptime)}"
            return Response(
                status: ready ? .ok : .serviceUnavailable,
                headers: [.contentType: "application/json"],
                body: .init(byteBuffer: .init(string: json)))
        }

        // Synthesis endpoint: JSON in, WAV bytes out.
        router.post("/v1/audio/speech") { request, _ -> Response in
            // Cap the request body at 1 MiB.
            let body = try await request.body.collect(upTo: 1 << 20)
            let req = SpeechRequest.parse(body)
            guard let text = req.text, !text.isEmpty else {
                return Self.errorResponse("Missing 'input' (or 'text') field.", status: .badRequest)
            }
            do {
                let result = try await engine.synthesize(
                    text: text,
                    language: req.language,
                    referencePath: req.voice,
                    timesteps: req.timesteps,
                    cfg: req.cfg,
                    // Requests that omit `prepad` get 0.1 s of leading silence.
                    prepad: req.prepad ?? 0.1)
                let wav = try Self.encodeWAV(samples: result.samples, sampleRate: result.sampleRate)
                return Response(
                    status: .ok,
                    headers: [.contentType: "audio/wav"],
                    body: .init(byteBuffer: .init(data: wav)))
            } catch {
                // Every synthesis or encoding failure is reported as a 500
                // with the error's description in the JSON body.
                return Self.errorResponse("Synthesis failed: \(error)", status: .internalServerError)
            }
        }

        let app = Application(
            router: router,
            configuration: .init(address: .hostname(host, port: port)))
        try await app.run()
    }

    // MARK: - Helpers

    /// Encodes PCM samples as WAV bytes by writing them to a uniquely named
    /// file in the temporary directory and reading that file back.
    ///
    /// - Parameters:
    ///   - samples: Audio samples to encode.
    ///   - sampleRate: Sample rate passed through to `WAVWriter.write`.
    /// - Returns: The contents of the written WAV file.
    /// - Throws: Any error from `WAVWriter.write` or `Data(contentsOf:)`.
    static func encodeWAV(samples: [Float], sampleRate: Int) throws -> Data {
        let tmp = FileManager.default.temporaryDirectory
            .appendingPathComponent("madcat-say-serve-\(UUID().uuidString).wav")
        // Register cleanup before writing so that a write which fails
        // part-way does not leave a stray file behind. Removing a file that
        // was never created fails harmlessly because of `try?`.
        defer { try? FileManager.default.removeItem(at: tmp) }
        try WAVWriter.write(samples: samples, sampleRate: sampleRate, to: tmp)
        return try Data(contentsOf: tmp)
    }

    /// Builds a JSON error response of the form `{"error": message}`.
    ///
    /// - Parameters:
    ///   - message: Human-readable error text placed under the `error` key.
    ///   - status: HTTP status of the response.
    /// - Returns: A response with an `application/json` content type. If the
    ///   message cannot be serialized the body is empty.
    static func errorResponse(_ message: String, status: HTTPResponse.Status) -> Response {
        let data = (try? JSONSerialization.data(withJSONObject: ["error": message])) ?? Data()
        return Response(
            status: status,
            headers: [.contentType: "application/json"],
            body: .init(byteBuffer: .init(data: data)))
    }

    /// Writes `message` followed by a newline to standard error.
    func log(_ message: String) {
        FileHandle.standardError.write(Data((message + "\n").utf8))
    }
}

// MARK: - Request model

/// Parsed `/v1/audio/speech` request. OpenAI-ish: `input` is the text, `voice`
/// is a reference WAV path (loopback / same host) for zero-shot cloning.
struct SpeechRequest {
    /// Text to synthesize; read from `input`, falling back to `text`.
    var text: String?
    /// Value of the `voice` field, if it was a string.
    var voice: String?
    /// Value of the `language` field, if it was a string.
    var language: String?
    /// Value of the `timesteps` field, if it was an integer.
    var timesteps: Int?
    /// Value of the `cfg` field, narrowed from `Double` to `Float`.
    var cfg: Float?
    /// Value of the `prepad` field, if it was a number.
    var prepad: Double?

    /// Leniently decodes a request body.
    ///
    /// Never fails: a body that is not a JSON object yields a request with
    /// every field `nil`, and any field that is missing or has the wrong
    /// type is left `nil`.
    ///
    /// - Parameter body: Raw request bytes.
    /// - Returns: The fields that could be extracted from `body`.
    static func parse(_ body: ByteBuffer) -> SpeechRequest {
        let payload = Data(buffer: body)
        guard
            let decoded = try? JSONSerialization.jsonObject(with: payload),
            let fields = decoded as? [String: Any]
        else {
            return SpeechRequest()
        }

        // `input` takes precedence; `text` is only consulted when `input`
        // is absent or not a string.
        let spokenText = (fields["input"] as? String) ?? (fields["text"] as? String)
        let guidance = (fields["cfg"] as? Double).map { Float($0) }

        return SpeechRequest(
            text: spokenText,
            voice: fields["voice"] as? String,
            language: fields["language"] as? String,
            timesteps: fields["timesteps"] as? Int,
            cfg: guidance,
            prepad: fields["prepad"] as? Double)
    }
}

// MARK: - Resident synthesis engine

/// Holds the loaded model and serializes single-GPU access. Caches the most
/// recently used reference voice so repeated clone calls skip the re-decode.
actor SynthEngine {
    /// Upper bound, in seconds, on the leading silence a caller may request.
    /// Larger values are clamped to this limit.
    private static let maxPrepadSeconds: Double = 30

    private var model: VoxCPM2TTSModel?
    private let modelId: String
    private let defaultTimesteps: Int
    private let defaultCfg: Float
    // Single-entry reference cache, keyed on the path string exactly as the
    // caller supplied it. Changes to the file on disk are not detected.
    private var cachedReferencePath: String?
    private var cachedReference: [Float]?

    /// Creates an engine for `aufklarer/VoxCPM2-MLX-<variant>`.
    ///
    /// - Parameters:
    ///   - variant: Model variant suffix used to build the model identifier.
    ///   - defaultTimesteps: Timestep count used when a call passes `nil`.
    ///   - defaultCfg: Guidance scale used when a call passes `nil`.
    init(variant: String, defaultTimesteps: Int, defaultCfg: Float) {
        self.modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
        self.defaultTimesteps = defaultTimesteps
        self.defaultCfg = defaultCfg
    }

    /// `true` once a model has been loaded.
    var isReady: Bool { model != nil }

    /// Loads the model if none is loaded yet, reporting progress lines on
    /// standard error.
    ///
    /// - Throws: Any error from `VoxCPM2TTSModel.fromPretrained`.
    func load() async throws {
        if model != nil { return }
        let m = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
            FileHandle.standardError.write(Data("  [\(Int(progress * 100))%] \(status)\n".utf8))
        }
        model = m
    }

    /// Runs one short throw-away synthesis with the default settings.
    /// Does nothing when no model is loaded.
    ///
    /// - Throws: Any error from the model's generation call.
    func warmup() async throws {
        guard let m = model else { return }
        _ = try await m.generateVoxCPM2(
            text: "Warming up.",
            language: nil,
            refAudio: nil,
            inferenceTimesteps: defaultTimesteps,
            cfgValue: defaultCfg)
    }

    /// Synthesized audio together with its sample rate.
    struct Result { let samples: [Float]; let sampleRate: Int }

    /// Synthesizes `text`, optionally conditioned on a reference recording,
    /// and prepends leading silence.
    ///
    /// - Parameters:
    ///   - text: Text to synthesize.
    ///   - language: Passed through to the model unchanged.
    ///   - referencePath: Path to a reference audio file (`~` is expanded);
    ///     `nil` or empty means no reference audio.
    ///   - timesteps: Per-call override of the default timestep count.
    ///   - cfg: Per-call override of the default guidance scale.
    ///   - prepad: Seconds of silence to prepend. Non-finite or non-positive
    ///     values add no silence; values above `maxPrepadSeconds` are clamped.
    /// - Returns: The padded samples and the model's sample rate.
    /// - Throws: `ValidationError` when the model is unavailable, the
    ///   reference file does not exist, or no audio is produced; otherwise
    ///   any error from audio loading or generation.
    func synthesize(
        text: String,
        language: String?,
        referencePath: String?,
        timesteps: Int?,
        cfg: Float?,
        prepad: Double
    ) async throws -> Result {
        if model == nil { try await load() }
        guard let m = model else {
            throw ValidationError("Model failed to load.")
        }

        var refAudio: [Float]?
        if let path = referencePath, !path.isEmpty {
            if path == cachedReferencePath, let cached = cachedReference {
                // Same path string as the previous reference: reuse the
                // already-loaded samples.
                refAudio = cached
            } else {
                let url = URL(fileURLWithPath: (path as NSString).expandingTildeInPath)
                guard FileManager.default.fileExists(atPath: url.path) else {
                    throw ValidationError("Reference file not found: \(path)")
                }
                let loaded = try AudioFileLoader.load(url: url, targetSampleRate: 16000)
                cachedReferencePath = path
                cachedReference = loaded
                refAudio = loaded
            }
        }

        // NOTE(review): an actor does not hold its isolation across an
        // `await`. If this call genuinely suspends, overlapping requests can
        // interleave here — confirm whether stricter serialization is needed.
        let audio = try await m.generateVoxCPM2(
            text: text,
            language: language,
            refAudio: refAudio,
            inferenceTimesteps: timesteps ?? defaultTimesteps,
            cfgValue: cfg ?? defaultCfg)
        guard !audio.isEmpty else {
            throw ValidationError("No audio was generated.")
        }

        let rate = m.sampleRate
        let padSamples = Self.padSampleCount(seconds: prepad, sampleRate: rate)
        let out = padSamples > 0 ? [Float](repeating: 0, count: padSamples) + audio : audio
        return Result(samples: out, sampleRate: rate)
    }

    /// Converts a requested silence duration into a sample count.
    ///
    /// `prepad` arrives from the request body, and converting an
    /// out-of-range or non-finite `Double` to `Int` traps at runtime, so the
    /// value is validated and clamped before the conversion.
    private static func padSampleCount(seconds: Double, sampleRate: Int) -> Int {
        guard seconds.isFinite, seconds > 0, sampleRate > 0 else { return 0 }
        let clamped = min(seconds, maxPrepadSeconds)
        return Int(clamped * Double(sampleRate))
    }
}