mirror of
https://gitlab.com/tildes/tildes.git
synced 2026-04-16 06:18:34 +02:00
Handle zero-width joiner (ZWJ) Unicode characters in emoji
Some emoji variants are composed using a zero-width joiner (U+200D), and the existing code broke them by stripping that character out.
This commit is contained in:
@@ -77,6 +77,18 @@ def test_control_chars_removed():
|
||||
assert result == "I can be sneaky and add problemchars."
|
||||
|
||||
|
||||
def test_zero_width_joiners_kept_and_collapsed():
|
||||
"""Ensure that multiple zero width joiners are collapsed like spaces."""
|
||||
original = "🤷\u200D\u200D\u200D♀\u200d"
|
||||
assert process_string(original) == "🤷\u200D♀"
|
||||
|
||||
|
||||
def test_zero_width_joiners_allowed_inside_emojis_and_not_other_words():
|
||||
"""Ensure the zero width joiner char is kept inside emojis."""
|
||||
original = "🤷\u200D♀ foo\u200dbar"
|
||||
assert process_string(original) == "🤷\u200D♀ foobar"
|
||||
|
||||
|
||||
def test_leading_trailing_spaces_removed():
|
||||
"""Ensure leading/trailing spaces are removed from the string."""
|
||||
original = " Centered! "
|
||||
|
||||
@@ -78,3 +78,13 @@ def test_unicode_control_chars_removed(title_schema):
|
||||
title = "nothing\u0000strange\u0085going\u009con\u007fhere"
|
||||
result = title_schema.load({"title": title})
|
||||
assert result["title"] == "nothingstrangegoingonhere"
|
||||
|
||||
|
||||
def test_zero_width_joiner_emojis_kept(title_schema):
|
||||
"""Test that emojis are parsed correctly"""
|
||||
title = "🤷🤷♂️🤷♀️🤷🏻🤷🏻♀️🤷🏻♂️🤷🏼🤷🏼♀️🤷🏼♂️🤷🏽🤷🏽♀️🤷🏽♂️🤷🏾🤷🏾♀️🤷🏾♂️🤷🏿🤷🏿♀️🤷🏿♂️"
|
||||
result = title_schema.load({"title": title})
|
||||
assert (
|
||||
result["title"]
|
||||
== "🤷🤷♂️🤷♀️🤷🏻🤷🏻♀️🤷🏻♂️🤷🏼🤷🏼♀️🤷🏼♂️🤷🏽🤷🏽♀️🤷🏽♂️🤷🏾🤷🏾♀️🤷🏾♂️🤷🏿🤷🏿♀️🤷🏿♂️"
|
||||
)
|
||||
|
||||
@@ -178,7 +178,7 @@ def _sanitize_characters(original: str) -> str:
|
||||
"""Process a string and filter/replace problematic unicode."""
|
||||
final_characters = []
|
||||
|
||||
for char in original:
|
||||
for index, char in enumerate(original):
|
||||
category = unicodedata.category(char)
|
||||
|
||||
if category.startswith("Z"):
|
||||
@@ -189,6 +189,19 @@ def _sanitize_characters(original: str) -> str:
|
||||
# newlines, which are replaced with normal spaces
|
||||
if char == "\n":
|
||||
final_characters.append(" ")
|
||||
elif char == "\u200D":
|
||||
final_length = len(final_characters)
|
||||
# only check for the ZWJ if it's between two characters
|
||||
if final_length <= index < len(original) - 1:
|
||||
char_before_category = unicodedata.category(
|
||||
final_characters[final_length - 1]
|
||||
)
|
||||
char_after_category = unicodedata.category(original[index + 1])
|
||||
# only keep the ZWJ if it's between two symbol characters
|
||||
if char_before_category.startswith(
|
||||
"S"
|
||||
) and char_after_category.startswith("S"):
|
||||
final_characters.append("\u200D")
|
||||
else:
|
||||
# any other type of character, just keep it
|
||||
final_characters.append(char)
|
||||
|
||||
Reference in New Issue
Block a user