Files
vorleser/VorleserMac/Services/KokoroPipeline.swift

419 lines
17 KiB
Swift

import Foundation
import CoreML
/// End-to-end Kokoro text-to-speech pipeline backed by Core ML models.
///
/// Flow: text -> phonemes (eSpeak) -> token ids -> duration model ->
/// token/frame alignment -> optional F0/noise prediction -> decoder -> PCM floats.
final class KokoroPipeline {

    /// Errors raised when a model asset is missing or a model omits an expected output.
    enum PipelineError: Error, LocalizedError {
        case modelNotFound(String)
        case outputMissing(String)

        var errorDescription: String? {
            switch self {
            case .modelNotFound(let name):
                return "Missing model: \(name)"
            case .outputMissing(let name):
                return "Missing output: \(name)"
            }
        }
    }

    private let phonemizer = EspeakPhonemizer()
    private let tokenizer: KokoroTokenizer
    private let durationModel: MLModel
    private let decoderModel: MLModel
    /// Optional F0/noise predictor; when it fails to load, zero curves are substituted.
    private let f0nModel: MLModel?
    /// Decoder frame budget for the fixed-length bucket.
    private let bucketFrames: Int
    /// F0/N frame budget (twice the decoder frame count for this bucket).
    private let bucketF0Frames: Int
    // Empirical tuning knobs for the current debugging configuration.
    private let playbackSpeed: Float = 0.7
    private let f0Scale: Float = 1.25
    private let outputGain: Float = 2.0
    /// Flat voice-pack data; 256 floats per style vector.
    /// NOTE(review): the style vector is selected by phoneme count (see
    /// `synthesizeTokenIds`) — confirm this matches the pack's layout.
    private let voicePack: [Float]
    private let voicePackCount: Int

    init() throws {
        tokenizer = try KokoroTokenizer.loadFromBundle()
        durationModel = try KokoroPipeline.loadModel(named: "kokoro_duration")
        // Use 3s bucket for debugging signal quality.
        decoderModel = try KokoroPipeline.loadModel(named: "kokoro_decoder_only_3s")
        f0nModel = try? KokoroPipeline.loadModel(named: "kokoro_f0n_3s")
        bucketFrames = 120
        bucketF0Frames = 240
        voicePack = try KokoroPipeline.loadVoicePack()
        voicePackCount = max(1, voicePack.count / 256)
    }

    /// Synthesizes a single short utterance (token stream capped at 512).
    /// - Parameter text: Plain text, or a pre-phonemized string prefixed with "[PHONEMES]".
    /// - Returns: Mono PCM samples, clipped to [-1, 1] after gain.
    func synthesize(text: String) throws -> [Float] {
        let (phonemes, tokenIdsAll, unknown, phonemeScalarCount) = try prepareTokens(from: text, maxTokens: 512)
        if !unknown.isEmpty {
            print("KokoroTokenizer unknown tokens: \(unknown.prefix(20))")
        }
        print("Phonemes (preview): \(phonemes.prefix(200))")
        return try synthesizeTokenIds(tokenIdsAll, phonemeScalarCount: phonemeScalarCount)
    }

    /// Synthesizes long text by splitting the token stream into fixed-size chunks,
    /// synthesizing each independently, and crossfading the audio at chunk joins.
    func synthesizeLong(text: String) throws -> [Float] {
        let (phonemes, tokenIdsAll, unknown, _) = try prepareTokens(from: text, maxTokens: 4096)
        if !unknown.isEmpty {
            print("KokoroTokenizer unknown tokens: \(unknown.prefix(20))")
        }
        print("Phonemes (preview): \(phonemes.prefix(200))")
        // 70 tokens per chunk keeps each chunk well inside the 128-slot model input.
        let chunkSize = 70
        var chunks: [[Int]] = []
        var idx = 0
        while idx < tokenIdsAll.count {
            let end = min(tokenIdsAll.count, idx + chunkSize)
            chunks.append(Array(tokenIdsAll[idx..<end]))
            idx = end
        }
        var audioChunks: [[Float]] = []
        for chunk in chunks {
            // Per-chunk voice index: use the chunk's own token count.
            let samples = try synthesizeTokenIds(chunk, phonemeScalarCount: chunk.count)
            audioChunks.append(samples)
        }
        return concatenateWithCrossfade(chunks: audioChunks, crossfade: 400)
    }

    /// Converts text to phonemes (or strips the "[PHONEMES]" escape) and tokenizes.
    /// - Returns: (phoneme string, token ids, unknown tokens, phoneme scalar count).
    private func prepareTokens(from text: String, maxTokens: Int) throws -> (String, [Int], [String], Int) {
        let phonemes: String
        if text.hasPrefix("[PHONEMES]") {
            phonemes = text.replacingOccurrences(of: "[PHONEMES]", with: "").trimmingCharacters(in: .whitespacesAndNewlines)
        } else {
            phonemes = try phonemizer.phonemize(text)
        }
        let (tokenIdsAll, unknown, phonemeScalarCount) = tokenizer.tokenize(phonemes, maxTokens: maxTokens)
        if tokenIdsAll.count > maxTokens {
            print("Token count truncated from \(tokenIdsAll.count) to \(maxTokens) for bucket")
        }
        return (phonemes, tokenIdsAll, unknown, phonemeScalarCount)
    }

    /// Runs the full model chain for one token sequence and returns PCM samples.
    private func synthesizeTokenIds(_ tokenIdsAll: [Int], phonemeScalarCount: Int) throws -> [Float] {
        // Token id 0 acts as the start/end sentinel.
        let tokenIds = [0] + tokenIdsAll + [0]
        print("Token count: \(tokenIds.count) (input: \(tokenIdsAll.count) + 2 for start/end)")
        let inputIds = try MLMultiArray(shape: [1, 128], dataType: .int32)
        let attention = try MLMultiArray(shape: [1, 128], dataType: .int32)
        // Zero-pad, then fill the leading slots; tokens beyond 128 are dropped
        // (NOTE(review): the trailing end-sentinel is lost when count > 128).
        for i in 0..<128 {
            inputIds[i] = 0
            attention[i] = 0
        }
        for (idx, id) in tokenIds.prefix(128).enumerated() {
            inputIds[idx] = NSNumber(value: id)
            attention[idx] = 1
        }
        let refS = try MLMultiArray(shape: [1, 256], dataType: .float32)
        // Style vector selected by phoneme count, clamped into the pack's range.
        let voiceIndex = max(0, min(voicePackCount - 1, max(1, phonemeScalarCount) - 1))
        let voiceOffset = voiceIndex * 256
        for i in 0..<256 {
            // Clamp reads: a truncated voice file (fewer than 256 floats) would
            // otherwise index past the end of `voicePack`. Missing tail is zero.
            let src = voiceOffset + i
            refS[i] = NSNumber(value: src < voicePack.count ? voicePack[src] : 0)
        }
        let speed = try MLMultiArray(shape: [1], dataType: .float32)
        speed[0] = NSNumber(value: playbackSpeed)
        let durationProvider = try MLDictionaryFeatureProvider(dictionary: [
            "input_ids": MLFeatureValue(multiArray: inputIds),
            "attention_mask": MLFeatureValue(multiArray: attention),
            "ref_s": MLFeatureValue(multiArray: refS),
            "speed": MLFeatureValue(multiArray: speed)
        ])
        let durationOutput = try durationModel.prediction(from: durationProvider)
        guard let d = durationOutput.featureValue(for: "d")?.multiArrayValue else {
            throw PipelineError.outputMissing("d")
        }
        guard let tEn = durationOutput.featureValue(for: "t_en")?.multiArrayValue else {
            throw PipelineError.outputMissing("t_en")
        }
        guard let s = durationOutput.featureValue(for: "s")?.multiArrayValue else {
            throw PipelineError.outputMissing("s")
        }
        guard let predDur = durationOutput.featureValue(for: "pred_dur")?.multiArrayValue else {
            throw PipelineError.outputMissing("pred_dur")
        }
        // Some model variants echo back a (possibly transformed) style vector.
        let refSOut = durationOutput.featureValue(for: "ref_s_out")?.multiArrayValue ?? refS
        let tokenCount = min(tokenIds.count, 128)
        let durations = makeDurations(predDur: predDur, tokenCount: tokenCount, targetFrames: bucketFrames)
        print("Durations sum: \(durations.reduce(0, +)) for tokenCount \(tokenCount)")
        let predAln = buildAlignment(tokenCount: tokenCount, targetFrames: bucketFrames, durations: durations)
        let asr = try buildAligned(tensor: tEn, channels: 512, tokenCount: tokenCount, alignment: predAln, tokenAxis: 2, channelAxis: 1, frameCount: bucketFrames)
        let en = try buildAligned(tensor: d, channels: 640, tokenCount: tokenCount, alignment: predAln, tokenAxis: 1, channelAxis: 2, frameCount: bucketFrames)
        let (f0, n) = try predictF0N(en: en, s: s)
        print("d min/max: \(minMax(d))")
        print("d shape/strides: \(d.shape) / \(d.strides)")
        print("t_en min/max: \(minMax(tEn))")
        print("t_en shape/strides: \(tEn.shape) / \(tEn.strides)")
        print("s min/max: \(minMax(s))")
        print("s shape/strides: \(s.shape) / \(s.strides)")
        print("asr min/max: \(minMax(asr))")
        print("asr shape/strides: \(asr.shape) / \(asr.strides)")
        print("en min/max: \(minMax(en))")
        print("en shape/strides: \(en.shape) / \(en.strides)")
        print("F0 min/max: \(minMax(f0))")
        print("F0 shape/strides: \(f0.shape) / \(f0.strides)")
        print("N min/max: \(minMax(n))")
        print("N shape/strides: \(n.shape) / \(n.strides)")
        let decoderProvider = try MLDictionaryFeatureProvider(dictionary: [
            "asr": MLFeatureValue(multiArray: asr),
            "F0_pred": MLFeatureValue(multiArray: f0),
            "N_pred": MLFeatureValue(multiArray: n),
            "ref_s": MLFeatureValue(multiArray: refSOut)
        ])
        let decoderOutput = try decoderModel.prediction(from: decoderProvider)
        guard let waveform = decoderOutput.featureValue(for: "waveform")?.multiArrayValue else {
            throw PipelineError.outputMissing("waveform")
        }
        print("waveform shape/strides: \(waveform.shape) / \(waveform.strides) count=\(waveform.count)")
        var samples = waveformToArray(waveform)
        applyGain(&samples, gain: outputGain)
        if let min = samples.min(), let max = samples.max() {
            print("Waveform min/max: \(min) / \(max)")
        }
        return samples
    }

    /// Joins audio chunks with a linear crossfade of up to `crossfade` samples.
    private func concatenateWithCrossfade(chunks: [[Float]], crossfade: Int) -> [Float] {
        guard var output = chunks.first else { return [] }
        for chunk in chunks.dropFirst() {
            let fadeCount = min(crossfade, min(output.count, chunk.count))
            let start = output.count - fadeCount
            for i in 0..<fadeCount {
                // Linear ramp: tail of `output` fades out while head of `chunk` fades in.
                let t = Float(i) / Float(max(1, fadeCount - 1))
                let a = 1 - t
                let b = t
                output[start + i] = output[start + i] * a + chunk[i] * b
            }
            if chunk.count > fadeCount {
                output.append(contentsOf: chunk[fadeCount...])
            }
        }
        return output
    }

    /// Loads a compiled Core ML model from the bundle's "Models" subdirectory,
    /// falling back to the bundle root.
    private static func loadModel(named name: String) throws -> MLModel {
        if let url = Bundle.main.url(forResource: name, withExtension: "mlmodelc", subdirectory: "Models") ??
            Bundle.main.url(forResource: name, withExtension: "mlmodelc", subdirectory: nil) {
            return try MLModel(contentsOf: url)
        }
        throw PipelineError.modelNotFound(name)
    }

    /// Rounds the predicted per-token durations and rescales them so their sum
    /// matches `targetFrames` exactly (each token keeps at least one frame where possible).
    private func makeDurations(predDur: MLMultiArray, tokenCount: Int, targetFrames: Int) -> [Int] {
        var raw: [Int] = []
        raw.reserveCapacity(tokenCount)
        for i in 0..<tokenCount {
            let value = predDur[i].doubleValue
            // Guard against NaN/Inf from the model; treat as one frame.
            let safe = value.isFinite ? value : 1.0
            let rounded = max(1, Int(safe.rounded()))
            raw.append(rounded)
        }
        let sum = max(1, raw.reduce(0, +))
        let scale = Double(targetFrames) / Double(sum)
        var scaled = raw.map { max(1, Int((Double($0) * scale).rounded())) }
        var current = scaled.reduce(0, +)
        if current > targetFrames {
            // Shave excess frames off the tail first, never dropping below 1.
            var i = scaled.count - 1
            while current > targetFrames && i >= 0 {
                if scaled[i] > 1 {
                    scaled[i] -= 1
                    current -= 1
                } else {
                    i -= 1
                }
            }
        } else if current < targetFrames, let last = scaled.indices.last {
            // Dump any shortfall onto the final token.
            scaled[last] += (targetFrames - current)
        }
        return scaled
    }

    /// Builds a hard (0/1) token-to-frame alignment matrix: token i owns a
    /// contiguous run of `durations[i]` frames, clipped to `targetFrames`.
    private func buildAlignment(tokenCount: Int, targetFrames: Int, durations: [Int]) -> [[Float]] {
        var alignment = Array(repeating: Array(repeating: Float(0), count: targetFrames), count: tokenCount)
        var cursor = 0
        for i in 0..<tokenCount {
            let dur = durations[i]
            if dur <= 0 { continue }
            let end = min(targetFrames, cursor + dur)
            if cursor >= targetFrames { break }
            for f in cursor..<end {
                alignment[i][f] = 1.0
            }
            cursor = end
        }
        return alignment
    }

    /// Expands a per-token tensor to per-frame via the alignment matrix, then
    /// min-max normalizes each channel to [-1, 1]. Output shape: [1, channels, frameCount].
    /// - Throws: If the output `MLMultiArray` cannot be allocated.
    private func buildAligned(tensor: MLMultiArray, channels: Int, tokenCount: Int, alignment: [[Float]], tokenAxis: Int, channelAxis: Int, frameCount: Int) throws -> MLMultiArray {
        let output = try MLMultiArray(shape: [1, NSNumber(value: channels), NSNumber(value: frameCount)], dataType: .float32)
        // Zero-fill first: MLMultiArray contents are not guaranteed initialized,
        // and channels beyond the tensor's available channels are never written below.
        for i in 0..<output.count {
            output[i] = 0
        }
        let shape = tensor.shape.map { $0.intValue }
        let strides = tensor.strides.map { $0.intValue }
        let availableTokens = min(tokenCount, shape[tokenAxis])
        let availableChannels = min(channels, shape[channelAxis])
        // Flat element offset for (channel, token), honoring the tensor's strides.
        func offset(channel: Int, token: Int) -> Int {
            // assumes batch dimension = 1 at axis 0
            if strides.count >= 3 {
                var off = 0
                if channelAxis == 1 && tokenAxis == 2 {
                    off = channel * strides[1] + token * strides[2]
                } else if tokenAxis == 1 && channelAxis == 2 {
                    off = token * strides[1] + channel * strides[2]
                } else {
                    // fallback for unexpected layouts
                    off = channel * strides[max(channelAxis, 1)] + token * strides[max(tokenAxis, 1)]
                }
                return off
            }
            if strides.count == 2 {
                return channel * strides[0] + token * strides[1]
            }
            // Last resort: assume a flat row-major [channels, tokens] layout.
            return channel * (shape.last ?? 0) + token
        }
        // First pass: compute weighted sum
        for h in 0..<availableChannels {
            for f in 0..<frameCount {
                var sum: Float = 0
                for t in 0..<availableTokens {
                    let weight = alignment[t][f]
                    if weight == 0 { continue }
                    let idx = offset(channel: h, token: t)
                    if idx < 0 || idx >= tensor.count { continue }
                    let value = tensor[idx].floatValue
                    if !value.isFinite { continue }
                    sum += value * weight
                }
                let outIndex = h * frameCount + f
                output[outIndex] = NSNumber(value: sum)
            }
        }
        // Second pass: per-channel min-max normalization
        var channelMin = [Float](repeating: Float.greatestFiniteMagnitude, count: availableChannels)
        var channelMax = [Float](repeating: -Float.greatestFiniteMagnitude, count: availableChannels)
        for h in 0..<availableChannels {
            for f in 0..<frameCount {
                let idx = h * frameCount + f
                let value = output[idx].floatValue
                if value.isFinite {
                    if value < channelMin[h] { channelMin[h] = value }
                    if value > channelMax[h] { channelMax[h] = value }
                }
            }
        }
        for h in 0..<availableChannels {
            let range = channelMax[h] - channelMin[h]
            // Skip near-constant channels to avoid dividing by ~0.
            if range > 1e-6 {
                for f in 0..<frameCount {
                    let idx = h * frameCount + f
                    let value = output[idx].floatValue
                    if value.isFinite {
                        let normalized = (value - channelMin[h]) / range * 2.0 - 1.0
                        output[idx] = NSNumber(value: normalized)
                    } else {
                        output[idx] = NSNumber(value: Float(0))
                    }
                }
            }
        }
        return output
    }

    /// Predicts F0 and noise curves from the aligned encoding; if the F0/N model
    /// is unavailable, returns zeroed curves of the bucket's F0 frame length.
    private func predictF0N(en: MLMultiArray, s: MLMultiArray) throws -> (MLMultiArray, MLMultiArray) {
        guard let f0nModel else {
            let f0 = try MLMultiArray(shape: [1, NSNumber(value: bucketF0Frames)], dataType: .float32)
            let n = try MLMultiArray(shape: [1, NSNumber(value: bucketF0Frames)], dataType: .float32)
            for i in 0..<bucketF0Frames { f0[i] = 0; n[i] = 0 }
            return (f0, n)
        }
        let provider = try MLDictionaryFeatureProvider(dictionary: [
            "en": MLFeatureValue(multiArray: en),
            "s": MLFeatureValue(multiArray: s)
        ])
        let output = try f0nModel.prediction(from: provider)
        guard let f0 = output.featureValue(for: "F0_pred")?.multiArrayValue else {
            throw PipelineError.outputMissing("F0_pred")
        }
        guard let n = output.featureValue(for: "N_pred")?.multiArrayValue else {
            throw PipelineError.outputMissing("N_pred")
        }
        sanitize(array: f0)
        sanitize(array: n)
        // Optional pitch scaling (debugging knob).
        if f0Scale != 1.0 {
            for i in 0..<f0.count {
                let v = f0[i].floatValue * f0Scale
                f0[i] = NSNumber(value: v)
            }
        }
        return (f0, n)
    }

    /// Replaces NaN/Inf entries with 0 in place.
    private func sanitize(array: MLMultiArray) {
        for i in 0..<array.count {
            let value = array[i].floatValue
            if !value.isFinite {
                array[i] = 0
            }
        }
    }

    /// Returns "min / max" over finite entries, or "n/a" if none are finite.
    private func minMax(_ array: MLMultiArray) -> String {
        var minVal = Float.greatestFiniteMagnitude
        var maxVal = -Float.greatestFiniteMagnitude
        for i in 0..<array.count {
            let v = array[i].floatValue
            if !v.isFinite { continue }
            if v < minVal { minVal = v }
            if v > maxVal { maxVal = v }
        }
        if minVal == Float.greatestFiniteMagnitude { return "n/a" }
        return "\(minVal) / \(maxVal)"
    }

    /// Loads the raw little-endian float32 voice pack "am_michael.f32" from the bundle.
    private static func loadVoicePack() throws -> [Float] {
        guard let url = Bundle.main.url(forResource: "am_michael", withExtension: "f32", subdirectory: "Voices") ??
            Bundle.main.url(forResource: "am_michael", withExtension: "f32", subdirectory: nil) else {
            throw CocoaError(.fileNoSuchFile)
        }
        let data = try Data(contentsOf: url)
        let count = data.count / MemoryLayout<Float>.size
        // Copy bytes instead of binding memory: Data's backing buffer is not
        // guaranteed to be Float-aligned, so bindMemory(to: Float.self) on it
        // is undefined behavior for a misaligned buffer.
        var floats = [Float](repeating: 0, count: count)
        _ = floats.withUnsafeMutableBufferPointer { dest in
            data.copyBytes(to: dest)
        }
        return floats
    }

    /// Flattens the decoder waveform to [Float], mapping NaN/Inf to 0.
    private func waveformToArray(_ array: MLMultiArray) -> [Float] {
        let count = array.count
        var samples = [Float](repeating: 0, count: count)
        for i in 0..<count {
            let value = array[i].floatValue
            samples[i] = value.isFinite ? value : 0
        }
        return samples
    }

    /// Applies a gain and hard-clips the result to [-1, 1] in place.
    private func applyGain(_ samples: inout [Float], gain: Float) {
        guard gain != 1 else { return }
        for i in 0..<samples.count {
            var v = samples[i] * gain
            if v > 1 { v = 1 }
            if v < -1 { v = -1 }
            samples[i] = v
        }
    }
}