revert tokenizer to character-by-character, add debug output

This commit is contained in:
2026-02-19 10:39:54 +01:00
parent 4c40e93542
commit 2c975007ca

View File

@@ -26,22 +26,28 @@ struct KokoroTokenizer {
} }
func tokenize(_ phonemes: String, maxTokens: Int = 128) -> (ids: [Int], unknown: [String], scalarCount: Int) { func tokenize(_ phonemes: String, maxTokens: Int = 128) -> (ids: [Int], unknown: [String], scalarCount: Int) {
let cleanedScalars = phonemes.unicodeScalars.filter { $0.properties.generalCategory != .format }
let cleanedView = String.UnicodeScalarView(cleanedScalars)
var ids: [Int] = [] var ids: [Int] = []
var unknown: [String] = [] var unknown: [String] = []
let tokens = phonemes.split(whereSeparator: { $0.isWhitespace }) for scalar in cleanedView {
for token in tokens { let token = String(scalar)
let tokenStr = String(token) if let id = vocab[token] {
if let id = vocab[tokenStr] {
ids.append(id) ids.append(id)
} else { } else {
unknown.append(tokenStr) unknown.append(token)
} }
if ids.count >= maxTokens { if ids.count >= maxTokens {
break break
} }
} }
return (ids, unknown, tokens.count) if !unknown.isEmpty {
print("Tokenizer: \(unknown.count) unknown tokens: \(unknown.prefix(30))")
}
print("Tokenizer: \(ids.count) ids from \(cleanedView.count) scalars, unknown: \(unknown.count)")
return (ids, unknown, cleanedView.count)
} }
} }