From 2c975007ca7facabef395c2630298f7440f12170 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20F=C3=B6rtsch?= Date: Thu, 19 Feb 2026 10:39:54 +0100 Subject: [PATCH] revert tokenizer to character-by-character, add debug output --- VorleserMac/Services/KokoroTokenizer.swift | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/VorleserMac/Services/KokoroTokenizer.swift b/VorleserMac/Services/KokoroTokenizer.swift index 4f1fa32..89f29ef 100644 --- a/VorleserMac/Services/KokoroTokenizer.swift +++ b/VorleserMac/Services/KokoroTokenizer.swift @@ -26,22 +26,28 @@ struct KokoroTokenizer { } func tokenize(_ phonemes: String, maxTokens: Int = 128) -> (ids: [Int], unknown: [String], scalarCount: Int) { + let cleanedScalars = phonemes.unicodeScalars.filter { $0.properties.generalCategory != .format } + let cleanedView = String.UnicodeScalarView(cleanedScalars) var ids: [Int] = [] var unknown: [String] = [] - let tokens = phonemes.split(whereSeparator: { $0.isWhitespace }) - for token in tokens { - let tokenStr = String(token) - if let id = vocab[tokenStr] { + for scalar in cleanedView { + let token = String(scalar) + if let id = vocab[token] { ids.append(id) } else { - unknown.append(tokenStr) + unknown.append(token) } if ids.count >= maxTokens { break } } - return (ids, unknown, tokens.count) + if !unknown.isEmpty { + print("Tokenizer: \(unknown.count) unknown tokens: \(unknown.prefix(30))") + } + print("Tokenizer: \(ids.count) ids from \(cleanedView.count) scalars, unknown: \(unknown.count)") + + return (ids, unknown, cleanedView.count) } }