54 lines
1.7 KiB
Swift
54 lines
1.7 KiB
Swift
import Foundation
|
|
|
|
/// Maps a phoneme string to integer token ids, one Unicode scalar at a time,
/// using a `vocab` lookup table.
struct KokoroTokenizer {
    /// Lookup table from a single-scalar string to its token id.
    let vocab: [String: Int]

    /// Creates a tokenizer backed by the given vocabulary.
    init(vocab: [String: Int]) {
        self.vocab = vocab
    }

    /// Builds a tokenizer from `config.json` in the main bundle.
    ///
    /// The file is looked up in the `Config` subdirectory first and at the
    /// bundle root second. Its top-level JSON object must contain a `"vocab"`
    /// entry that is a string-to-integer dictionary.
    ///
    /// - Throws: `CocoaError(.fileNoSuchFile)` when the resource is not found,
    ///   `CocoaError(.fileReadCorruptFile)` when the JSON lacks a usable
    ///   `"vocab"` dictionary, or any error raised while reading or parsing.
    static func loadFromBundle() throws -> KokoroTokenizer {
        let bundle = Bundle.main
        guard
            let url = bundle.url(forResource: "config", withExtension: "json", subdirectory: "Config")
                ?? bundle.url(forResource: "config", withExtension: "json", subdirectory: nil)
        else {
            throw CocoaError(.fileNoSuchFile)
        }

        let data = try Data(contentsOf: url)
        let object = try JSONSerialization.jsonObject(with: data)
        guard
            let dict = object as? [String: Any],
            let vocab = dict["vocab"] as? [String: Int]
        else {
            throw CocoaError(.fileReadCorruptFile)
        }

        return KokoroTokenizer(vocab: vocab)
    }

    /// Converts `phonemes` into token ids by looking up each Unicode scalar in `vocab`.
    ///
    /// Scalars whose general category is `.format` are removed before lookup.
    /// Scalars missing from `vocab` are collected in `unknown` and produce no id.
    /// Scanning stops once `maxTokens` ids have been produced; a `maxTokens`
    /// of zero or less produces no ids and no unknowns.
    ///
    /// - Parameters:
    ///   - phonemes: The phoneme string to tokenize.
    ///   - maxTokens: Upper bound on the number of ids returned.
    /// - Returns: The ids in input order, the unknown single-scalar strings seen
    ///   before scanning stopped, and the number of scalars left after removing
    ///   format scalars (counted over the whole input, not just the scanned part).
    func tokenize(_ phonemes: String, maxTokens: Int = 128) -> (ids: [Int], unknown: [String], scalarCount: Int) {
        let scalars = phonemes.unicodeScalars.filter { $0.properties.generalCategory != .format }
        let scalarCount = scalars.count
        var ids: [Int] = []
        var unknown: [String] = []

        for scalar in scalars {
            // Check the budget before consuming a scalar so that a non-positive
            // `maxTokens` never lets a token through.
            guard ids.count < maxTokens else { break }

            let token = String(scalar)
            if let id = vocab[token] {
                ids.append(id)
            } else {
                unknown.append(token)
            }
        }

        if !unknown.isEmpty {
            print("Tokenizer: \(unknown.count) unknown tokens: \(unknown.prefix(30))")
        }
        print("Tokenizer: \(ids.count) ids from \(scalarCount) scalars, unknown: \(unknown.count)")

        return (ids, unknown, scalarCount)
    }
}
|