diff --git a/VorleserMac/Services/KokoroTokenizer.swift b/VorleserMac/Services/KokoroTokenizer.swift
index ed51be2..4f1fa32 100644
--- a/VorleserMac/Services/KokoroTokenizer.swift
+++ b/VorleserMac/Services/KokoroTokenizer.swift
@@ -26,23 +26,22 @@ struct KokoroTokenizer {
     }
 
     func tokenize(_ phonemes: String, maxTokens: Int = 128) -> (ids: [Int], unknown: [String], scalarCount: Int) {
-        let cleanedScalars = phonemes.unicodeScalars.filter { $0.properties.generalCategory != .format }
-        let cleanedView = String.UnicodeScalarView(cleanedScalars)
         var ids: [Int] = []
         var unknown: [String] = []
 
-        for scalar in cleanedView {
-            let token = String(scalar)
-            if let id = vocab[token] {
+        let tokens = phonemes.split(whereSeparator: { $0.isWhitespace })
+        for token in tokens {
+            let tokenStr = String(token)
+            if let id = vocab[tokenStr] {
                 ids.append(id)
             } else {
-                unknown.append(token)
+                unknown.append(tokenStr)
             }
             if ids.count >= maxTokens {
                 break
             }
         }
 
-        return (ids, unknown, cleanedView.count)
+        return (ids, unknown, tokens.count)
     }
 }