fix tokenizer to split on whitespace for proper phoneme tokens

This commit is contained in:
2026-02-19 10:33:46 +01:00
parent 7098f318e9
commit 217eb78e87

View File

@@ -26,23 +26,22 @@ struct KokoroTokenizer {
}
/// Converts a whitespace-separated phoneme string into vocabulary token IDs.
///
/// - Parameters:
///   - phonemes: Phoneme tokens separated by any whitespace.
///   - maxTokens: Upper bound on the number of IDs produced (default 128).
/// - Returns: A tuple of:
///   - `ids`: vocabulary IDs for tokens present in `vocab`, capped at `maxTokens`;
///   - `unknown`: tokens with no vocabulary entry, collected until the cap is hit;
///   - `scalarCount`: total number of whitespace-separated tokens in the input.
///     NOTE(review): the label is kept for caller compatibility, but it now counts
///     tokens rather than unicode scalars — consider renaming in a follow-up.
func tokenize(_ phonemes: String, maxTokens: Int = 128) -> (ids: [Int], unknown: [String], scalarCount: Int) {
    // `split` drops empty subsequences, so runs of spaces and leading/trailing
    // whitespace never yield empty tokens.
    let tokens = phonemes.split(whereSeparator: { $0.isWhitespace })
    var ids: [Int] = []
    ids.reserveCapacity(min(tokens.count, maxTokens))
    var unknown: [String] = []
    for token in tokens {
        let tokenStr = String(token) // Substring -> String at the API boundary.
        if let id = vocab[tokenStr] {
            ids.append(id)
        } else {
            unknown.append(tokenStr)
        }
        // Stop once we have produced the maximum number of IDs; unknown tokens
        // do not count toward the cap.
        if ids.count >= maxTokens {
            break
        }
    }
    // Reports the full token count of the input even when truncated at maxTokens.
    return (ids, unknown, tokens.count)
}
}