import Foundation
|
|
import CoreML
|
|
|
|
final class KokoroPipeline {
|
|
/// Failures surfaced by the synthesis pipeline.
enum PipelineError: Error, LocalizedError {
    /// A compiled Core ML model could not be located in the bundle.
    case modelNotFound(String)
    /// A model prediction lacked an expected output feature.
    case outputMissing(String)

    /// Human-readable description, surfaced through `LocalizedError`.
    var errorDescription: String? {
        switch self {
        case let .modelNotFound(name):
            return "Missing model: \(name)"
        case let .outputMissing(name):
            return "Missing output: \(name)"
        }
    }
}
|
|
|
|
// MARK: - Pipeline components and tuning constants

// Converts raw text to phoneme strings (espeak-backed).
private let phonemizer = EspeakPhonemizer()
// Maps phoneme strings to model token ids.
private let tokenizer: KokoroTokenizer
// Stage 1: predicts per-token durations and encoder features from token ids.
private let durationModel: MLModel
// Final stage: renders the waveform from aligned features.
private let decoderModel: MLModel
// Optional pitch/noise predictor; `predictF0N` falls back to zeros when nil.
private let f0nModel: MLModel?
// Frame length of the fixed decoder bucket (120; the "_3s" model bucket).
private let bucketFrames: Int
// Frame length of the F0/N track (240 = 2 × bucketFrames).
private let bucketF0Frames: Int
// "speed" input fed to the duration model.
// NOTE(review): confirm direction — whether 0.7 slows down or speeds up output.
private let playbackSpeed: Float = 0.7
// In-place multiplier applied to the predicted F0 track.
private let f0Scale: Float = 1.25
// Linear gain applied to the decoded waveform (hard-clipped to [-1, 1]).
private let outputGain: Float = 2.0
// Flattened speaker-embedding pack; 256 floats per entry.
private let voicePack: [Float]
// Number of 256-float entries in `voicePack` (clamped to at least 1).
private let voicePackCount: Int
|
|
|
|
/// Loads the tokenizer, the Core ML models, and the speaker voice pack
/// from the app bundle. Throws when any required resource is missing;
/// the F0/N model is optional and loaded best-effort.
init() throws {
    tokenizer = try KokoroTokenizer.loadFromBundle()
    durationModel = try KokoroPipeline.loadModel(named: "kokoro_duration")
    // Use 3s bucket for debugging signal quality.
    decoderModel = try KokoroPipeline.loadModel(named: "kokoro_decoder_only_3s")
    // Optional: `predictF0N` substitutes zero pitch/noise tracks when absent.
    f0nModel = try? KokoroPipeline.loadModel(named: "kokoro_f0n_3s")
    // Fixed bucket sizes matching the 3s decoder export:
    // 120 feature frames, 240 F0/N frames.
    bucketFrames = 120
    bucketF0Frames = 240
    voicePack = try KokoroPipeline.loadVoicePack()
    // One voice entry per 256 floats; clamp so indexing never divides to zero.
    voicePackCount = max(1, voicePack.count / 256)
}
|
|
|
|
/// Synthesizes speech for `text` in a single pass (token input capped at 512).
/// Returns raw waveform samples in [-1, 1].
func synthesize(text: String) throws -> [Float] {
    let (phonemeString, ids, unknownTokens, scalarCount) = try prepareTokens(from: text, maxTokens: 512)
    if unknownTokens.isEmpty == false {
        print("KokoroTokenizer unknown tokens: \(unknownTokens.prefix(20))")
    }
    print("Phonemes (preview): \(phonemeString.prefix(200))")
    return try synthesizeTokenIds(ids, phonemeScalarCount: scalarCount)
}
|
|
|
|
/// Synthesizes long-form text: tokens are split into fixed windows, each
/// window is rendered independently, and the audio is stitched with a short
/// crossfade so chunk boundaries do not click.
func synthesizeLong(text: String) throws -> [Float] {
    let (phonemeString, ids, unknownTokens, _) = try prepareTokens(from: text, maxTokens: 4096)
    if unknownTokens.isEmpty == false {
        print("KokoroTokenizer unknown tokens: \(unknownTokens.prefix(20))")
    }
    print("Phonemes (preview): \(phonemeString.prefix(200))")

    // Partition the token ids into windows of at most 70 tokens.
    let chunkSize = 70
    let windows: [[Int]] = stride(from: 0, to: ids.count, by: chunkSize).map { start in
        Array(ids[start..<min(ids.count, start + chunkSize)])
    }

    // Render each window; the voice index tracks the window's own length.
    let rendered: [[Float]] = try windows.map { window in
        try synthesizeTokenIds(window, phonemeScalarCount: window.count)
    }

    return concatenateWithCrossfade(chunks: rendered, crossfade: 400)
}
|
|
|
|
/// Converts raw text — or a "[PHONEMES]"-prefixed phoneme literal — into
/// token ids. Returns (phonemes, token ids, unknown tokens, phoneme count).
private func prepareTokens(from text: String, maxTokens: Int) throws -> (String, [Int], [String], Int) {
    let phonemeString: String
    if text.hasPrefix("[PHONEMES]") {
        // Caller supplied phonemes directly; strip the marker and whitespace.
        phonemeString = text
            .replacingOccurrences(of: "[PHONEMES]", with: "")
            .trimmingCharacters(in: .whitespacesAndNewlines)
    } else {
        phonemeString = try phonemizer.phonemize(text)
    }

    let (ids, unknownTokens, scalarCount) = tokenizer.tokenize(phonemeString, maxTokens: maxTokens)
    if ids.count > maxTokens {
        print("Token count truncated from \(ids.count) to \(maxTokens) for bucket")
    }
    return (phonemeString, ids, unknownTokens, scalarCount)
}
|
|
|
|
/// Runs the full duration → alignment → F0/N → decoder chain for one token
/// chunk and returns the decoded waveform samples (post-gain, clipped).
/// - Parameters:
///   - tokenIdsAll: tokenized phonemes, without start/end padding.
///   - phonemeScalarCount: phoneme count used to pick the voice-pack entry.
/// - Throws: `PipelineError.outputMissing` when a model output is absent.
private func synthesizeTokenIds(_ tokenIdsAll: [Int], phonemeScalarCount: Int) throws -> [Float] {
    // Wrap the sequence with the model's start/end padding token (0).
    let tokenIds = [0] + tokenIdsAll + [0]
    print("Token count: \(tokenIds.count) (input: \(tokenIdsAll.count) + 2 for start/end)")

    // Fixed-length [1, 128] inputs: zero both buffers, then mark real tokens.
    let inputIds = try MLMultiArray(shape: [1, 128], dataType: .int32)
    let attention = try MLMultiArray(shape: [1, 128], dataType: .int32)
    for i in 0..<128 {
        inputIds[i] = 0
        attention[i] = 0
    }
    for (idx, id) in tokenIds.prefix(128).enumerated() {
        inputIds[idx] = NSNumber(value: id)
        attention[idx] = 1
    }

    // Select a 256-float voice embedding. The index appears keyed by phoneme
    // length (entry k-1 for k phonemes), clamped to available entries —
    // NOTE(review): confirm against the voice-pack layout.
    let refS = try MLMultiArray(shape: [1, 256], dataType: .float32)
    let voiceIndex = max(0, min(voicePackCount - 1, max(1, phonemeScalarCount) - 1))
    let voiceOffset = voiceIndex * 256
    for i in 0..<256 {
        refS[i] = NSNumber(value: voicePack[voiceOffset + i])
    }

    let speed = try MLMultiArray(shape: [1], dataType: .float32)
    speed[0] = NSNumber(value: playbackSpeed)

    // Stage 1: duration model — per-token durations plus encoder features
    // reused by the later stages.
    let durationProvider = try MLDictionaryFeatureProvider(dictionary: [
        "input_ids": MLFeatureValue(multiArray: inputIds),
        "attention_mask": MLFeatureValue(multiArray: attention),
        "ref_s": MLFeatureValue(multiArray: refS),
        "speed": MLFeatureValue(multiArray: speed)
    ])
    let durationOutput = try durationModel.prediction(from: durationProvider)

    guard let d = durationOutput.featureValue(for: "d")?.multiArrayValue else {
        throw PipelineError.outputMissing("d")
    }
    guard let tEn = durationOutput.featureValue(for: "t_en")?.multiArrayValue else {
        throw PipelineError.outputMissing("t_en")
    }
    guard let s = durationOutput.featureValue(for: "s")?.multiArrayValue else {
        throw PipelineError.outputMissing("s")
    }
    guard let predDur = durationOutput.featureValue(for: "pred_dur")?.multiArrayValue else {
        throw PipelineError.outputMissing("pred_dur")
    }
    // Some exports may not echo ref_s back; fall back to the input embedding.
    let refSOut = durationOutput.featureValue(for: "ref_s_out")?.multiArrayValue ?? refS

    // Stage 2: expand token-level features into the fixed 120-frame bucket.
    let tokenCount = min(tokenIds.count, 128)
    let durations = makeDurations(predDur: predDur, tokenCount: tokenCount, targetFrames: bucketFrames)
    print("Durations sum: \(durations.reduce(0, +)) for tokenCount \(tokenCount)")
    let predAln = buildAlignment(tokenCount: tokenCount, targetFrames: bucketFrames, durations: durations)
    // Axis choices imply t_en holds tokens on axis 2 / channels on axis 1,
    // and d the opposite — NOTE(review): confirm against the model export.
    let asr = buildAligned(tensor: tEn, channels: 512, tokenCount: tokenCount, alignment: predAln, tokenAxis: 2, channelAxis: 1, frameCount: bucketFrames)
    let en = buildAligned(tensor: d, channels: 640, tokenCount: tokenCount, alignment: predAln, tokenAxis: 1, channelAxis: 2, frameCount: bucketFrames)

    // Stage 3: pitch (F0) and noise tracks from aligned features.
    let (f0, n) = try predictF0N(en: en, s: s)

    // Debug dumps for signal-quality investigation.
    print("d min/max: \(minMax(d))")
    print("d shape/strides: \(d.shape) / \(d.strides)")
    print("t_en min/max: \(minMax(tEn))")
    print("t_en shape/strides: \(tEn.shape) / \(tEn.strides)")
    print("s min/max: \(minMax(s))")
    print("s shape/strides: \(s.shape) / \(s.strides)")
    print("asr min/max: \(minMax(asr))")
    print("asr shape/strides: \(asr.shape) / \(asr.strides)")
    print("en min/max: \(minMax(en))")
    print("en shape/strides: \(en.shape) / \(en.strides)")
    print("F0 min/max: \(minMax(f0))")
    print("F0 shape/strides: \(f0.shape) / \(f0.strides)")
    print("N min/max: \(minMax(n))")
    print("N shape/strides: \(n.shape) / \(n.strides)")

    // Stage 4: decoder renders the waveform from the aligned features.
    let decoderProvider = try MLDictionaryFeatureProvider(dictionary: [
        "asr": MLFeatureValue(multiArray: asr),
        "F0_pred": MLFeatureValue(multiArray: f0),
        "N_pred": MLFeatureValue(multiArray: n),
        "ref_s": MLFeatureValue(multiArray: refSOut)
    ])
    let decoderOutput = try decoderModel.prediction(from: decoderProvider)

    guard let waveform = decoderOutput.featureValue(for: "waveform")?.multiArrayValue else {
        throw PipelineError.outputMissing("waveform")
    }
    print("waveform shape/strides: \(waveform.shape) / \(waveform.strides) count=\(waveform.count)")

    // Post-process: copy out (non-finite samples → 0), apply gain with clipping.
    var samples = waveformToArray(waveform)
    applyGain(&samples, gain: outputGain)
    if let min = samples.min(), let max = samples.max() {
        print("Waveform min/max: \(min) / \(max)")
    }
    return samples
}
|
|
|
|
/// Joins audio chunks end-to-end, blending up to `crossfade` samples at each
/// seam with a linear ramp so chunk boundaries do not click.
private func concatenateWithCrossfade(chunks: [[Float]], crossfade: Int) -> [Float] {
    guard let head = chunks.first else { return [] }
    var stitched = head
    for next in chunks.dropFirst() {
        // Overlap can never exceed either neighbor's length.
        let overlap = min(crossfade, stitched.count, next.count)
        let seam = stitched.count - overlap
        let denominator = Float(max(1, overlap - 1))
        for offset in 0..<overlap {
            // fadeIn ramps 0→1 across the overlap; the old tail fades out
            // with the complementary weight.
            let fadeIn = Float(offset) / denominator
            stitched[seam + offset] = stitched[seam + offset] * (1 - fadeIn) + next[offset] * fadeIn
        }
        if next.count > overlap {
            stitched.append(contentsOf: next[overlap...])
        }
    }
    return stitched
}
|
|
|
|
/// Loads a compiled Core ML model from the bundle, preferring the "Models"
/// subdirectory and falling back to the bundle root.
/// - Throws: `PipelineError.modelNotFound` when neither location has it.
private static func loadModel(named name: String) throws -> MLModel {
    let candidates: [URL?] = [
        Bundle.main.url(forResource: name, withExtension: "mlmodelc", subdirectory: "Models"),
        Bundle.main.url(forResource: name, withExtension: "mlmodelc", subdirectory: nil)
    ]
    guard let url = candidates.compactMap({ $0 }).first else {
        throw PipelineError.modelNotFound(name)
    }
    return try MLModel(contentsOf: url)
}
|
|
|
|
/// Converts raw duration predictions into integer frame counts intended to
/// sum to `targetFrames` (the fixed decoder bucket), with a floor of one
/// frame per token.
private func makeDurations(predDur: MLMultiArray, tokenCount: Int, targetFrames: Int) -> [Int] {
    // Round each prediction to an integer frame count, coercing NaN/inf to 1.
    var raw: [Int] = []
    raw.reserveCapacity(tokenCount)
    for i in 0..<tokenCount {
        let value = predDur[i].doubleValue
        let safe = value.isFinite ? value : 1.0
        let rounded = max(1, Int(safe.rounded()))
        raw.append(rounded)
    }
    // Rescale so the total roughly matches the bucket length.
    let sum = max(1, raw.reduce(0, +))
    let scale = Double(targetFrames) / Double(sum)
    var scaled = raw.map { max(1, Int((Double($0) * scale).rounded())) }
    var current = scaled.reduce(0, +)
    if current > targetFrames {
        // Trim overshoot from the end, never taking a token below 1 frame.
        // NOTE(review): this drains trailing tokens to 1 before touching
        // earlier ones, skewing timing toward the start; and when
        // tokenCount > targetFrames the 1-frame floor makes the target
        // unreachable (the loop exits with `current` still above
        // targetFrames — buildAlignment clips the excess frames).
        var i = scaled.count - 1
        while current > targetFrames && i >= 0 {
            if scaled[i] > 1 {
                scaled[i] -= 1
                current -= 1
            } else {
                i -= 1
            }
        }
    } else if current < targetFrames, let last = scaled.indices.last {
        // Pad any shortfall onto the final token.
        scaled[last] += (targetFrames - current)
    }
    return scaled
}
|
|
|
|
/// Builds a hard (0/1) token→frame alignment matrix from per-token
/// durations: row t marks which of the `targetFrames` frames token t
/// occupies, filling left to right and clipping at the bucket boundary.
private func buildAlignment(tokenCount: Int, targetFrames: Int, durations: [Int]) -> [[Float]] {
    var matrix = [[Float]](repeating: [Float](repeating: 0, count: targetFrames), count: tokenCount)
    var frame = 0
    // `frame` only ever grows, so once the bucket is full no later token
    // can write anything.
    for token in 0..<tokenCount where frame < targetFrames {
        let span = durations[token]
        guard span > 0 else { continue }
        let upper = min(targetFrames, frame + span)
        for f in frame..<upper {
            matrix[token][f] = 1.0
        }
        frame = upper
    }
    return matrix
}
|
|
|
|
/// Expands a token-level tensor to frame level using the 0/1 alignment
/// matrix (weighted sum over tokens for each frame), then min-max
/// normalizes each channel to [-1, 1]. Output shape is [1, channels, frameCount].
/// - Parameters:
///   - tensor: source features; batch axis 0 is assumed to be size 1.
///   - tokenAxis: source axis holding tokens.
///   - channelAxis: source axis holding channels.
private func buildAligned(tensor: MLMultiArray, channels: Int, tokenCount: Int, alignment: [[Float]], tokenAxis: Int, channelAxis: Int, frameCount: Int) -> MLMultiArray {
    // Shape is small and fixed; a construction failure here is a logic bug.
    let output = try! MLMultiArray(shape: [1, NSNumber(value: channels), NSNumber(value: frameCount)], dataType: .float32)

    // BUG FIX: MLMultiArray(shape:dataType:) does NOT zero its backing
    // memory, and the weighted-sum pass below only writes channels below
    // `availableChannels`. If the source tensor has fewer channels than
    // requested, the remaining output channels would otherwise carry
    // uninitialized garbage straight into the decoder. Zero-fill first.
    for i in 0..<output.count {
        output[i] = 0
    }

    let shape = tensor.shape.map { $0.intValue }
    let strides = tensor.strides.map { $0.intValue }
    let availableTokens = min(tokenCount, shape[tokenAxis])
    let availableChannels = min(channels, shape[channelAxis])

    // Linear storage offset of (channel, token) in the source tensor.
    func offset(channel: Int, token: Int) -> Int {
        // assumes batch dimension = 1 at axis 0
        if strides.count >= 3 {
            var off = 0
            if channelAxis == 1 && tokenAxis == 2 {
                off = channel * strides[1] + token * strides[2]
            } else if tokenAxis == 1 && channelAxis == 2 {
                off = token * strides[1] + channel * strides[2]
            } else {
                // fallback for unexpected layouts
                off = channel * strides[max(channelAxis, 1)] + token * strides[max(tokenAxis, 1)]
            }
            return off
        }
        if strides.count == 2 {
            return channel * strides[0] + token * strides[1]
        }
        return channel * (shape.last ?? 0) + token
    }

    // First pass: weighted sum of token features for each output frame.
    // Out-of-range offsets and non-finite source values are skipped.
    for h in 0..<availableChannels {
        for f in 0..<frameCount {
            var sum: Float = 0
            for t in 0..<availableTokens {
                let weight = alignment[t][f]
                if weight == 0 { continue }
                let idx = offset(channel: h, token: t)
                if idx < 0 || idx >= tensor.count { continue }
                let value = tensor[idx].floatValue
                if !value.isFinite { continue }
                sum += value * weight
            }
            let outIndex = h * frameCount + f
            output[outIndex] = NSNumber(value: sum)
        }
    }

    // Second pass: per-channel min/max over finite values only.
    var channelMin = [Float](repeating: Float.greatestFiniteMagnitude, count: availableChannels)
    var channelMax = [Float](repeating: -Float.greatestFiniteMagnitude, count: availableChannels)
    for h in 0..<availableChannels {
        for f in 0..<frameCount {
            let idx = h * frameCount + f
            let value = output[idx].floatValue
            if value.isFinite {
                if value < channelMin[h] { channelMin[h] = value }
                if value > channelMax[h] { channelMax[h] = value }
            }
        }
    }

    // Third pass: rescale each channel to [-1, 1]; (near-)flat channels are
    // left at their summed values. NOTE(review): min-max normalizing decoder
    // features is unusual — presumably a debugging aid for signal quality;
    // confirm the decoder actually expects normalized inputs.
    for h in 0..<availableChannels {
        let range = channelMax[h] - channelMin[h]
        if range > 1e-6 {
            for f in 0..<frameCount {
                let idx = h * frameCount + f
                let value = output[idx].floatValue
                if value.isFinite {
                    let normalized = (value - channelMin[h]) / range * 2.0 - 1.0
                    output[idx] = NSNumber(value: normalized)
                } else {
                    output[idx] = NSNumber(value: Float(0))
                }
            }
        }
    }

    return output
}
|
|
|
|
/// Runs the F0/noise model on aligned encoder features, sanitizes the
/// outputs, and applies the configured pitch scale in place.
/// When no F0/N model is loaded, returns zero-filled tracks of the bucket
/// length so the decoder can still run.
/// - Throws: `PipelineError.outputMissing` when a model output is absent.
private func predictF0N(en: MLMultiArray, s: MLMultiArray) throws -> (MLMultiArray, MLMultiArray) {
    guard let model = f0nModel else {
        // Fallback: silent pitch and noise tracks.
        let zeroF0 = try MLMultiArray(shape: [1, NSNumber(value: bucketF0Frames)], dataType: .float32)
        let zeroN = try MLMultiArray(shape: [1, NSNumber(value: bucketF0Frames)], dataType: .float32)
        for i in 0..<bucketF0Frames {
            zeroF0[i] = 0
            zeroN[i] = 0
        }
        return (zeroF0, zeroN)
    }

    let features = try MLDictionaryFeatureProvider(dictionary: [
        "en": MLFeatureValue(multiArray: en),
        "s": MLFeatureValue(multiArray: s)
    ])
    let prediction = try model.prediction(from: features)

    guard let f0 = prediction.featureValue(for: "F0_pred")?.multiArrayValue else {
        throw PipelineError.outputMissing("F0_pred")
    }
    guard let n = prediction.featureValue(for: "N_pred")?.multiArrayValue else {
        throw PipelineError.outputMissing("N_pred")
    }

    // Zero out any NaN/inf values before downstream use.
    sanitize(array: f0)
    sanitize(array: n)

    // Boost or attenuate pitch in place when a non-unity scale is configured.
    if f0Scale != 1.0 {
        for i in 0..<f0.count {
            f0[i] = NSNumber(value: f0[i].floatValue * f0Scale)
        }
    }
    return (f0, n)
}
|
|
|
|
/// Replaces every non-finite entry (NaN / ±inf) in `array` with zero, in place.
private func sanitize(array: MLMultiArray) {
    for index in 0..<array.count where !array[index].floatValue.isFinite {
        array[index] = 0
    }
}
|
|
|
|
/// Formats the finite min/max of `array` as "min / max" for debug logging,
/// or "n/a" when the array contains no finite values.
private func minMax(_ array: MLMultiArray) -> String {
    var lowest = Float.greatestFiniteMagnitude
    var highest = -Float.greatestFiniteMagnitude
    for index in 0..<array.count {
        let value = array[index].floatValue
        guard value.isFinite else { continue }
        if value < lowest { lowest = value }
        if value > highest { highest = value }
    }
    // `lowest` untouched means every entry was skipped as non-finite.
    guard lowest != Float.greatestFiniteMagnitude else { return "n/a" }
    return "\(lowest) / \(highest)"
}
|
|
|
|
/// Loads the "am_michael" speaker-embedding pack (raw float32 file) from the
/// bundle, preferring the "Voices" subdirectory over the bundle root.
/// - Returns: the pack as a flat `[Float]`; trailing bytes that do not form
///   a whole Float are dropped.
/// - Throws: `CocoaError(.fileNoSuchFile)` when the resource is missing.
private static func loadVoicePack() throws -> [Float] {
    guard let url = Bundle.main.url(forResource: "am_michael", withExtension: "f32", subdirectory: "Voices") ??
        Bundle.main.url(forResource: "am_michael", withExtension: "f32", subdirectory: nil) else {
        throw CocoaError(.fileNoSuchFile)
    }
    let data = try Data(contentsOf: url)
    let count = data.count / MemoryLayout<Float>.size
    // BUG FIX: the previous bindMemory(to: Float.self) on Data's raw buffer
    // is undefined behavior when the backing storage is not Float-aligned.
    // copyBytes(to:) has no alignment requirement on the source and copies
    // at most the destination's capacity, matching the old prefix(count).
    var floats = [Float](repeating: 0, count: count)
    _ = floats.withUnsafeMutableBufferPointer { destination in
        data.copyBytes(to: destination)
    }
    // NOTE(review): assumes the file stores little-endian float32 — confirm
    // for packs produced on other platforms.
    return floats
}
|
|
|
|
/// Copies the decoder waveform into a plain Float array, mapping any
/// non-finite samples (NaN / ±inf) to silence.
private func waveformToArray(_ array: MLMultiArray) -> [Float] {
    return (0..<array.count).map { index -> Float in
        let sample = array[index].floatValue
        return sample.isFinite ? sample : 0
    }
}
|
|
|
|
/// Multiplies every sample by `gain` in place, hard-clipping the result to
/// [-1, 1]. A gain of exactly 1 is a no-op.
private func applyGain(_ samples: inout [Float], gain: Float) {
    guard gain != 1 else { return }
    for index in samples.indices {
        let amplified = samples[index] * gain
        samples[index] = Swift.min(1, Swift.max(-1, amplified))
    }
}
|
|
}
|