fix tokenizer to split on whitespace for proper phoneme tokens

This commit is contained in:
2026-02-19 10:33:46 +01:00
parent 7098f318e9
commit 217eb78e87

View File

@@ -26,23 +26,22 @@ struct KokoroTokenizer {
}
/// Converts a whitespace-separated phoneme string into vocabulary token IDs.
///
/// - Parameters:
///   - phonemes: Phoneme tokens separated by any whitespace.
///   - maxTokens: Upper bound on the number of IDs produced (default 128).
/// - Returns: A tuple of:
///   - `ids`: vocabulary IDs for tokens present in `vocab`, capped at `maxTokens`;
///   - `unknown`: tokens with no vocabulary entry, collected until the cap is hit;
///   - `scalarCount`: total number of whitespace-separated tokens in the input.
///     NOTE(review): the label is kept for caller compatibility, but it now counts
///     tokens rather than unicode scalars — consider renaming in a follow-up.
func tokenize(_ phonemes: String, maxTokens: Int = 128) -> (ids: [Int], unknown: [String], scalarCount: Int) {
    // `split` drops empty subsequences, so runs of spaces and leading/trailing
    // whitespace never yield empty tokens.
    let tokens = phonemes.split(whereSeparator: { $0.isWhitespace })
    var ids: [Int] = []
    ids.reserveCapacity(min(tokens.count, maxTokens))
    var unknown: [String] = []
    for token in tokens {
        let tokenStr = String(token) // Substring -> String at the API boundary.
        if let id = vocab[tokenStr] {
            ids.append(id)
        } else {
            unknown.append(tokenStr)
        }
        // Stop once we have produced the maximum number of IDs; unknown tokens
        // do not count toward the cap.
        if ids.count >= maxTokens {
            break
        }
    }
    // Reports the full token count of the input even when truncated at maxTokens.
    return (ids, unknown, tokens.count)
}
}