From c81b78bea07bc1ac186f76c80983a2cd727a44a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20F=C3=B6rtsch?= Date: Fri, 13 Mar 2026 22:17:45 +0100 Subject: [PATCH] add Synthesizer wrapper around KokoroSwift with voice pack support Co-Authored-By: Claude Opus 4.6 --- VorleserKit/Package.resolved | 47 ++++++++++++++++++- VorleserKit/Package.swift | 14 +++++- .../Sources/Synthesizer/Synthesizer.swift | 47 +++++++++++++++++++ .../Synthesizer/SynthesizerError.swift | 15 ++++++ .../Sources/Synthesizer/VoicePack.swift | 21 +++++++++ 5 files changed, 142 insertions(+), 2 deletions(-) create mode 100644 VorleserKit/Sources/Synthesizer/Synthesizer.swift create mode 100644 VorleserKit/Sources/Synthesizer/SynthesizerError.swift create mode 100644 VorleserKit/Sources/Synthesizer/VoicePack.swift diff --git a/VorleserKit/Package.resolved b/VorleserKit/Package.resolved index db443d9..fccf385 100644 --- a/VorleserKit/Package.resolved +++ b/VorleserKit/Package.resolved @@ -1,6 +1,51 @@ { - "originHash" : "1bf1d418d8d58ea936176af8e96313605ea72a6fbf437f877b8e5d9a5b0d822c", + "originHash" : "6130f8afd39b4763c878d68f7965a176b7136eb36f304eaf701123ffb55cbbf7", "pins" : [ + { + "identity" : "kokoro-ios", + "kind" : "remoteSourceControl", + "location" : "https://github.com/mlalma/kokoro-ios.git", + "state" : { + "revision" : "4d6d1d8ff8cd012014180c9cd4cf0151e7682354", + "version" : "1.0.11" + } + }, + { + "identity" : "misakiswift", + "kind" : "remoteSourceControl", + "location" : "https://github.com/mlalma/MisakiSwift", + "state" : { + "revision" : "6835a1ce4a8854075c89f18ff75c74b13ef58e15", + "version" : "1.0.6" + } + }, + { + "identity" : "mlx-swift", + "kind" : "remoteSourceControl", + "location" : "https://github.com/ml-explore/mlx-swift", + "state" : { + "revision" : "f58bd2c2b3b84316da69182f436db4219aff30b9", + "version" : "0.30.2" + } + }, + { + "identity" : "mlxutilslibrary", + "kind" : "remoteSourceControl", + "location" : "https://github.com/mlalma/MLXUtilsLibrary.git", + 
"state" : {
+        "revision" : "41f6cfd5d68b65aa3c65a34efe3b71c371ed915b",
+        "version" : "0.0.6"
+      }
+    },
+    {
+      "identity" : "swift-numerics",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-numerics",
+      "state" : {
+        "revision" : "0c0290ff6b24942dadb83a929ffaaa1481df04a2",
+        "version" : "1.1.1"
+      }
+    },
     {
       "identity" : "swiftsoup",
       "kind" : "remoteSourceControl",
diff --git a/VorleserKit/Package.swift b/VorleserKit/Package.swift
index b18ae9f..3ccd84c 100644
--- a/VorleserKit/Package.swift
+++ b/VorleserKit/Package.swift
@@ -11,15 +11,27 @@ let package = Package(
         .library(name: "VorleserKit", targets: ["VorleserKit"]),
         .library(name: "BookParser", targets: ["BookParser"]),
         .library(name: "Storage", targets: ["Storage"]),
+        .library(name: "Synthesizer", targets: ["Synthesizer"]),
     ],
     dependencies: [
         .package(url: "https://github.com/weichsel/ZIPFoundation.git", from: "0.9.0"),
         .package(url: "https://github.com/scinfu/SwiftSoup.git", from: "2.7.0"),
+        .package(url: "https://github.com/mlalma/kokoro-ios.git", exact: "1.0.11"),
+        .package(url: "https://github.com/mlalma/MLXUtilsLibrary.git", exact: "0.0.6"),
+        .package(url: "https://github.com/ml-explore/mlx-swift", exact: "0.30.2"),
     ],
     targets: [
         .target(
             name: "VorleserKit",
-            dependencies: ["Storage"]
+            dependencies: ["Storage", "Synthesizer"]
+        ),
+        .target(
+            name: "Synthesizer",
+            dependencies: [
+                .product(name: "KokoroSwift", package: "kokoro-ios"),
+                .product(name: "MLXUtilsLibrary", package: "MLXUtilsLibrary"),
+                .product(name: "MLX", package: "mlx-swift"),
+            ]
         ),
         .target(
             name: "Storage",
diff --git a/VorleserKit/Sources/Synthesizer/Synthesizer.swift b/VorleserKit/Sources/Synthesizer/Synthesizer.swift
new file mode 100644
index 0000000..f4ac49f
--- /dev/null
+++ b/VorleserKit/Sources/Synthesizer/Synthesizer.swift
@@ -0,0 +1,79 @@
+import Foundation
+import KokoroSwift
+import MLX
+import MLXUtilsLibrary
+
+/// Speech synthesizer that pairs a `KokoroTTS` engine with a single voice
+/// embedding selected from a voices archive.
+///
+/// NOTE(review): the type is `@unchecked Sendable`, yet nothing in this file
+/// serializes access to the engine — confirm `KokoroTTS.generateAudio` may be
+/// called concurrently, or funnel all calls through one task or queue.
+public final class Synthesizer: @unchecked Sendable {
+    /// Nominal sample rate in Hz; presumably the rate of the samples returned
+    /// by `synthesize(text:)` — confirm against KokoroSwift.
+    public static let sampleRate: Double = 24_000
+
+    private let tts: KokoroTTS
+    private let voiceEmbedding: MLXArray
+    private let language: Language
+    private let voicePack: VoicePack
+
+    /// Creates a synthesizer for `voice`.
+    ///
+    /// - Parameters:
+    ///   - voice: Voice pack whose `name` selects the `<name>.npy` archive entry.
+    ///   - modelURL: File URL of the Kokoro model.
+    ///   - voicesURL: File URL of the voices archive.
+    /// - Throws: `SynthesizerError.modelNotFound` or `.voicesNotFound` when a
+    ///   file is missing, and `.voiceNotAvailable` when the archive cannot be
+    ///   read or has no entry for `voice`.
+    public init(voice: VoicePack, modelURL: URL, voicesURL: URL) throws {
+        let fileManager = FileManager.default
+        guard fileManager.fileExists(atPath: modelURL.path) else {
+            throw SynthesizerError.modelNotFound(modelURL.path)
+        }
+        guard fileManager.fileExists(atPath: voicesURL.path) else {
+            throw SynthesizerError.voicesNotFound(voicesURL.path)
+        }
+
+        // Resolve the voice before building the engine so an unknown voice
+        // fails without constructing the model first.
+        guard let voices = NpyzReader.read(fileFromPath: voicesURL),
+              let embedding = voices["\(voice.name).npy"] else {
+            throw SynthesizerError.voiceNotAvailable(voice.name)
+        }
+
+        // Exhaustive on purpose: a new `VoicePack.Language` case must be mapped
+        // explicitly instead of silently falling back to British English.
+        switch voice.language {
+        case .enUS: self.language = .enUS
+        case .enGB: self.language = .enGB
+        }
+        self.voiceEmbedding = embedding
+        self.voicePack = voice
+        self.tts = KokoroTTS(modelPath: modelURL, g2p: .misaki)
+    }
+
+    /// Generates audio samples for `text` at speed 1.0.
+    ///
+    /// - Parameter text: Text to speak.
+    /// - Returns: The generated samples; empty when `text` is empty or
+    ///   whitespace-only, in which case the engine is not invoked.
+    /// - Throws: `SynthesizerError.synthesisFailure` wrapping the engine error.
+    public func synthesize(text: String) throws -> [Float] {
+        guard !text.allSatisfy(\.isWhitespace) else { return [] }
+
+        do {
+            let (samples, _) = try tts.generateAudio(
+                voice: voiceEmbedding,
+                language: language,
+                text: text,
+                speed: 1.0
+            )
+            return samples
+        } catch {
+            throw SynthesizerError.synthesisFailure(text, error)
+        }
+    }
+}
diff --git a/VorleserKit/Sources/Synthesizer/SynthesizerError.swift b/VorleserKit/Sources/Synthesizer/SynthesizerError.swift
new file mode 100644
index 0000000..fea17ee
--- /dev/null
+++ b/VorleserKit/Sources/Synthesizer/SynthesizerError.swift
@@ -0,0 +1,31 @@
+/// Failures reported by `Synthesizer`.
+public enum SynthesizerError: Error, CustomStringConvertible {
+    /// No file exists at the given model path.
+    case modelNotFound(String)
+    /// No file exists at the given voices archive path.
+    case voicesNotFound(String)
+    /// The named voice could not be loaded from the voices archive.
+    case voiceNotAvailable(String)
+    /// Synthesis of the given text failed with the underlying error.
+    case synthesisFailure(String, Error)
+
+    /// Longest excerpt of the input text quoted in `description`.
+    private static let excerptLimit = 50
+
+    /// Human-readable message; input text longer than the excerpt limit is
+    /// cut, and only then marked with a trailing ellipsis.
+    public var description: String {
+        switch self {
+        case .modelNotFound(let path):
+            return "kokoro model not found at \(path)"
+        case .voicesNotFound(let path):
+            return "voices.npz not found at \(path)"
+        case .voiceNotAvailable(let name):
+            return "voice '\(name)' not found in voices.npz"
+        case .synthesisFailure(let text, let error):
+            let limit = Self.excerptLimit
+            let excerpt = text.count > limit ? "\(text.prefix(limit))..." : text
+            return "synthesis failed for '\(excerpt)': \(error)"
+        }
+    }
+}
diff --git
a/VorleserKit/Sources/Synthesizer/VoicePack.swift b/VorleserKit/Sources/Synthesizer/VoicePack.swift
new file mode 100644
index 0000000..cb6e1e5
--- /dev/null
+++ b/VorleserKit/Sources/Synthesizer/VoicePack.swift
@@ -0,0 +1,28 @@
+import Foundation
+
+/// A selectable voice: its lookup name, the label shown for it, and the
+/// language it declares.
+public struct VoicePack: Sendable, Identifiable {
+    /// Languages a voice pack can declare.
+    public enum Language: String, Sendable {
+        case enUS = "en-us"
+        case enGB = "en-gb"
+    }
+
+    /// Curated voices bundled with the app.
+    public static let curated: [VoicePack] = [
+        .init(name: "af_heart", displayName: "Heart", language: .enUS),
+        .init(name: "af_bella", displayName: "Bella", language: .enUS),
+        .init(name: "am_michael", displayName: "Michael", language: .enUS),
+    ]
+
+    /// Voice name (for example `af_heart`).
+    public let name: String
+    /// Human-readable label.
+    public let displayName: String
+    /// Language declared for this voice.
+    public let language: Language
+
+    /// `Identifiable` conformance: the voice name doubles as the identity.
+    public var id: String { name }
+}