Revert the tokenizer to character-by-character (per-Unicode-scalar) lookup and add debug output for unknown tokens and token counts.
This commit is contained in:
@@ -26,22 +26,28 @@ struct KokoroTokenizer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Maps a phoneme string to vocabulary token IDs, one Unicode scalar at a time.
///
/// - Parameters:
///   - phonemes: The phoneme string to tokenize.
///   - maxTokens: Upper bound on the number of IDs produced (default 128).
/// - Returns: A tuple of the matched `ids`, the scalars not found in `vocab`
///   (`unknown`), and the total cleaned scalar count (`scalarCount`).
///   NOTE(review): when the cap is hit, `scalarCount` still reflects the full
///   cleaned input, not just the consumed prefix — presumably intentional for
///   the debug output; confirm with callers.
func tokenize(_ phonemes: String, maxTokens: Int = 128) -> (ids: [Int], unknown: [String], scalarCount: Int) {
    // Drop invisible format-category scalars (e.g. zero-width characters)
    // before lookup so they never miss the vocabulary.
    let visibleScalars = phonemes.unicodeScalars.filter {
        $0.properties.generalCategory != .format
    }
    let cleaned = String.UnicodeScalarView(visibleScalars)

    var ids: [Int] = []
    var unknown: [String] = []

    // One vocabulary lookup per scalar; stop once the ID budget is exhausted.
    for scalar in cleaned {
        let key = String(scalar)
        if let id = vocab[key] {
            ids.append(id)
        } else {
            unknown.append(key)
        }
        guard ids.count < maxTokens else { break }
    }

    // Debug output: surface misses and overall counts.
    if !unknown.isEmpty {
        print("Tokenizer: \(unknown.count) unknown tokens: \(unknown.prefix(30))")
    }
    print("Tokenizer: \(ids.count) ids from \(cleaned.count) scalars, unknown: \(unknown.count)")

    return (ids, unknown, cleaned.count)
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user