revert tokenizer to character-by-character, add debug output
This commit is contained in:
@@ -26,22 +26,28 @@ struct KokoroTokenizer {
|
||||
}
|
||||
|
||||
/// Converts a phoneme string into vocabulary ids, one Unicode scalar at a time.
///
/// Scalars whose general category is `.format` are dropped first. Each remaining
/// scalar is turned into a single-scalar `String` and looked up in `vocab`;
/// scalars with no entry are collected in `unknown` instead of being emitted.
/// A summary of the run is written with `print` for debugging.
///
/// - Parameters:
///   - phonemes: The phoneme text to tokenize.
///   - maxTokens: Upper bound on the number of ids returned. Scanning stops as
///     soon as this many ids have been collected, so later scalars are neither
///     emitted nor reported as unknown.
/// - Returns: A tuple of the collected `ids`, the `unknown` scalars (as strings)
///   seen before scanning stopped, and `scalarCount`, the number of scalars left
///   after format-scalar filtering (the full cleaned input, not just the part
///   that was scanned).
func tokenize(_ phonemes: String, maxTokens: Int = 128) -> (ids: [Int], unknown: [String], scalarCount: Int) {
    let cleanedScalars = phonemes.unicodeScalars.filter { $0.properties.generalCategory != .format }
    let cleanedView = String.UnicodeScalarView(cleanedScalars)
    var ids: [Int] = []
    var unknown: [String] = []

    for scalar in cleanedView {
        // Check the cap before consuming a scalar so that a non-positive
        // `maxTokens` yields no ids at all; for positive caps this stops at
        // exactly the same point as checking after the append.
        if ids.count >= maxTokens {
            break
        }
        let token = String(scalar)
        if let id = vocab[token] {
            ids.append(id)
        } else {
            unknown.append(token)
        }
    }

    // Debug output: list at most the first 30 unknown scalars, then a one-line summary.
    if !unknown.isEmpty {
        print("Tokenizer: \(unknown.count) unknown tokens: \(unknown.prefix(30))")
    }
    print("Tokenizer: \(ids.count) ids from \(cleanedView.count) scalars, unknown: \(unknown.count)")

    return (ids, unknown, cleanedView.count)
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user