import ArgumentParser
import AudioCommon
import Foundation
@preconcurrency import VoxCPM2TTS

/// Entry point for the `madcat-say` tool: on-device voice-cloning TTS built on
/// VoxCPM2 (MLX/Metal).
///
/// This type carries no options of its own; it only registers the subcommands
/// (`Speak`, `GpuCheck`, `Serve`). Because `Speak` is the default subcommand,
/// the historical `madcat-say "text"` invocation behaves exactly like
/// `madcat-say speak "text"`. `madcat-say gpu-check` runs the Metal/GPU
/// diagnostic.
@main
struct MadcatSay: AsyncParsableCommand {
    /// Command metadata consumed by ArgumentParser (name, help abstract, and
    /// subcommand routing).
    static let configuration: CommandConfiguration = .init(
        commandName: "madcat-say",
        abstract: "On-device voice cloning with VoxCPM2 (MLX/Metal).",
        subcommands: [
            Speak.self,
            GpuCheck.self,
            Serve.self,
        ],
        defaultSubcommand: Speak.self
    )
}

/// Speak text in an (optionally cloned) voice using VoxCPM2 on-device (MLX/Metal).
///
///   madcat-say "Hello there"
///   madcat-say -r samantha.wav "I was calibrated just for you."
///   madcat-say -r samantha.wav -l polish "Cześć, jestem Samantha."
///   madcat-say -r samantha.wav -o out.wav "Saved instead of played."
///
/// Execution order: option validation, then (unless `--no-daemon`) an attempt
/// to hand the request to a resident daemon on `127.0.0.1`, and finally an
/// in-process model load as the fallback.
struct Speak: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        commandName: "speak",
        abstract: "Speak text in a cloned voice using VoxCPM2 (on-device, MLX/Metal).",
        discussion: """
        With no -r, uses the model's default voice. With -r it zero-shot clones
        the reference voice. Language is auto-detected from the text; -l is an
        optional hint. By default the audio is played through the speakers; pass
        -o to write a 48 kHz WAV instead.
        """
    )

    @Argument(help: "The text to speak.")
    var text: String

    @Option(name: [.customShort("r"), .long], help: "Reference voice WAV to clone (any sample rate).")
    var reference: String?

    @Option(name: [.customShort("l"), .long], help: "Language hint (auto-detected if omitted).")
    var language: String?

    @Option(name: [.customShort("o"), .long], help: "Write WAV here instead of playing it.")
    var output: String?

    // NOTE(review): `variant` is only used to build the in-process model id; it
    // is not forwarded in the daemon request. Presumably the daemon serves
    // whichever model it already loaded — confirm.
    @Option(name: .long, help: "Model variant: int4 (default), int8, bf16.")
    var variant: String = "int4"

    @Option(name: .long, help: "Diffusion timesteps per patch (default 10; lower = faster, rougher).")
    var timesteps: Int = 10

    @Option(name: .long, help: "Classifier-free guidance scale (default 2.0).")
    var cfg: Float = 2.0

    @Option(name: .long, help: "Seconds of silence prepended to avoid a clipped start (default 0.1).")
    var prepad: Double = 0.1

    @Option(name: .long, help: "Resident daemon port to try first (default 8765).")
    var daemonPort: Int = 8765

    @Flag(name: .long, help: "Bypass the resident daemon; always load the model in-process.")
    var noDaemon: Bool = false

    /// Error raised when the external audio player exits unsuccessfully.
    private struct PlaybackError: LocalizedError {
        let status: Int32
        var errorDescription: String? { "afplay exited with status \(status)." }
    }

    /// Rejects numeric options that parse successfully but cannot be used.
    ///
    /// `Float`/`Double` parsing accepts "nan" and "inf". Those values would
    /// trap in the float-to-`Int` conversion of the pre-pad length and make
    /// `JSONSerialization` raise an (uncatchable from Swift) exception when the
    /// daemon request body is built, so they are refused up front.
    ///
    /// - Throws: `ValidationError` when `--cfg` or `--prepad` is not finite.
    func validate() throws {
        guard cfg.isFinite else {
            throw ValidationError("--cfg must be a finite number.")
        }
        guard prepad.isFinite else {
            throw ValidationError("--prepad must be a finite number of seconds.")
        }
    }

    /// Synthesizes `text` and either plays it or writes it to `output`.
    ///
    /// - Throws: `ValidationError` when the reference file is missing, no audio
    ///   is produced, or the pre-pad length is not representable; any error
    ///   from model loading, synthesis, WAV writing, or playback.
    func run() async throws {
        // Fast path: forward to the resident `serve` daemon if it is up, so the
        // model stays warm in GPU memory instead of being cold-loaded per call.
        if !noDaemon, await trySpeakViaDaemon() {
            return
        }

        // Fail fast on a bad -r path before paying for a model load.
        let referenceURL: URL?
        if let reference {
            let url = resolvedReferenceURL(reference)
            guard FileManager.default.fileExists(atPath: url.path) else {
                throw ValidationError("Reference file not found: \(reference)")
            }
            referenceURL = url
        } else {
            referenceURL = nil
        }

        let modelId = "aufklarer/VoxCPM2-MLX-\(variant)"
        log("Loading \(modelId) ...")
        let model = try await VoxCPM2TTSModel.fromPretrained(modelId: modelId) { progress, status in
            log("  [\(Int(progress * 100))%] \(status)")
        }
        defer { model.unload() }

        var refAudio: [Float]? = nil
        if let referenceURL {
            // VoxCPM2 ingests reference audio at 16 kHz.
            refAudio = try AudioFileLoader.load(url: referenceURL, targetSampleRate: 16000)
            log("  Reference: \(refAudio?.count ?? 0) samples")
        }

        log("Synthesizing ...")
        let audio = try await model.generateVoxCPM2(
            text: text,
            language: language,
            refAudio: refAudio,
            inferenceTimesteps: timesteps,
            cfgValue: cfg
        )
        guard !audio.isEmpty else {
            throw ValidationError("No audio was generated.")
        }
        let rate = model.sampleRate

        // Prepend leading silence so the first phoneme isn't clipped on playback.
        // Negative values clamp to zero; `Int(exactly:)` turns an out-of-range
        // (or non-finite) length into an error instead of a runtime trap.
        let padLength = (max(0, prepad) * Double(rate)).rounded(.towardZero)
        guard let padSamples = Int(exactly: padLength) else {
            throw ValidationError("--prepad is too large.")
        }
        let out = padSamples > 0 ? [Float](repeating: 0, count: padSamples) + audio : audio

        if let output {
            let url = URL(fileURLWithPath: output)
            try WAVWriter.write(samples: out, sampleRate: rate, to: url)
            log("Saved \(out.count) samples (\(seconds(out.count, rate))s) to \(output)")
        } else {
            try playThroughSpeakers(samples: out, sampleRate: rate)
        }
    }

    // MARK: - Helpers

    /// Try to satisfy this request via the resident `serve` daemon. Returns true
    /// if the daemon handled it (audio played or saved); false on any failure so
    /// the caller falls back to an in-process model load. Never throws — a down
    /// or broken daemon must degrade gracefully, not abort the command.
    private func trySpeakViaDaemon() async -> Bool {
        let base = "http://127.0.0.1:\(daemonPort)"
        let session = URLSession(configuration: .ephemeral)
        // Release the session's resources once any in-flight work is finished.
        defer { session.finishTasksAndInvalidate() }

        // Health probe with a short timeout so a missing daemon costs ~nothing.
        guard let healthURL = URL(string: base + "/health") else { return false }
        var healthReq = URLRequest(url: healthURL)
        healthReq.timeoutInterval = 0.6
        do {
            let (_, resp) = try await session.data(for: healthReq)
            guard let http = resp as? HTTPURLResponse, http.statusCode == 200 else { return false }
        } catch {
            return false
        }

        // Build the synthesis request.
        var payload: [String: Any] = ["input": text, "timesteps": timesteps, "cfg": Double(cfg), "prepad": prepad]
        if let language { payload["language"] = language }
        if let reference {
            // The daemon runs with its own working directory, so a relative
            // path must be made absolute here. If the value does not name a
            // local file, send it tilde-expanded exactly as before and let the
            // daemon decide what to do with it.
            let absolute = resolvedReferenceURL(reference).path
            payload["voice"] = FileManager.default.fileExists(atPath: absolute)
                ? absolute
                : (reference as NSString).expandingTildeInPath
        }

        // `data(withJSONObject:)` raises an exception (not a Swift error) for
        // unencodable objects such as NaN, so check validity first.
        guard let speakURL = URL(string: base + "/v1/audio/speech"),
              JSONSerialization.isValidJSONObject(payload),
              let bodyData = try? JSONSerialization.data(withJSONObject: payload) else {
            return false
        }
        var req = URLRequest(url: speakURL)
        req.httpMethod = "POST"
        req.setValue("application/json", forHTTPHeaderField: "Content-Type")
        req.httpBody = bodyData
        req.timeoutInterval = 300

        log("Using resident daemon on port \(daemonPort) ...")
        let wav: Data
        do {
            let (body, resp) = try await session.data(for: req)
            guard let http = resp as? HTTPURLResponse, http.statusCode == 200, !body.isEmpty else {
                log("Daemon returned no audio; falling back to in-process load.")
                return false
            }
            wav = body
        } catch {
            log("Daemon request failed (\(error)); falling back to in-process load.")
            return false
        }

        // Deliver the audio. A failure here (unwritable path, player error) is
        // reported separately from a transport failure but still falls back.
        do {
            if let output {
                try wav.write(to: URL(fileURLWithPath: output))
                log("Saved \(wav.count) bytes to \(output)")
            } else {
                let tmp = temporaryWAVURL()
                // Registered before the write so a partial file is also removed.
                defer { try? FileManager.default.removeItem(at: tmp) }
                try wav.write(to: tmp)
                try playWAVFile(at: tmp)
            }
            return true
        } catch {
            log("Could not save or play daemon audio (\(error)); falling back to in-process load.")
            return false
        }
    }

    /// Writes `samples` to a temporary WAV file, plays it, and removes the file.
    ///
    /// - Throws: Any error from `WAVWriter.write`, from launching the player,
    ///   or a `PlaybackError` when the player exits with a non-zero status.
    private func playThroughSpeakers(samples: [Float], sampleRate: Int) throws {
        let tmp = temporaryWAVURL()
        // Registered before the write so a partial file is also removed.
        defer { try? FileManager.default.removeItem(at: tmp) }
        try WAVWriter.write(samples: samples, sampleRate: sampleRate, to: tmp)
        try playWAVFile(at: tmp)
    }

    /// Plays the WAV at `url` with `/usr/bin/afplay`, blocking until it exits.
    ///
    /// - Throws: The launch error from `Process.run()`, or `PlaybackError` when
    ///   the player reports a non-zero exit status.
    private func playWAVFile(at url: URL) throws {
        let proc = Process()
        proc.executableURL = URL(fileURLWithPath: "/usr/bin/afplay")
        proc.arguments = [url.path]
        try proc.run()
        proc.waitUntilExit()
        guard proc.terminationStatus == 0 else {
            throw PlaybackError(status: proc.terminationStatus)
        }
    }

    /// Returns a unique file URL in the temporary directory for a scratch WAV.
    private func temporaryWAVURL() -> URL {
        FileManager.default.temporaryDirectory
            .appendingPathComponent("madcat-say-\(UUID().uuidString).wav")
    }

    /// Expands a leading `~` in `path` and resolves it to an absolute file URL
    /// (relative paths resolve against the current working directory).
    private func resolvedReferenceURL(_ path: String) -> URL {
        let expanded = (path as NSString).expandingTildeInPath
        return URL(fileURLWithPath: expanded).absoluteURL
    }

    /// Formats `count` samples at `rate` Hz as seconds with two decimals.
    private func seconds(_ count: Int, _ rate: Int) -> String {
        String(format: "%.2f", Double(count) / Double(rate))
    }

    /// Progress/status goes to stderr so stdout stays clean for piping.
    private func log(_ message: String) {
        FileHandle.standardError.write(Data((message + "\n").utf8))
    }
}