mirror of
https://gitlab.com/tildes/tildes.git
synced 2026-04-17 06:48:36 +02:00
Support Unicode 16.0 emoji via unicodedata2 package
See merge request tildes/tildes!159
This commit is contained in:
@@ -12,4 +12,5 @@ types-bleach
|
||||
types-python-dateutil
|
||||
types-redis
|
||||
types-requests
|
||||
+unicodedata2
|
||||
webtest
|
||||
|
||||
@@ -109,6 +109,7 @@ types-python-dateutil==0.1.4
|
||||
types-redis==3.5.4
|
||||
types-requests==2.25.0
|
||||
typing-extensions==4.12.2
|
||||
+unicodedata2==16.0.0
|
||||
urllib3==1.26.6
|
||||
venusian==3.0.0
|
||||
waitress==2.0.0
|
||||
|
||||
@@ -36,6 +36,7 @@ SQLAlchemy<1.4
|
||||
SQLAlchemy-Utils
|
||||
stripe
|
||||
titlecase
|
||||
+unicodedata2
|
||||
webargs
|
||||
wrapt
|
||||
zope.sqlalchemy
|
||||
|
||||
@@ -67,6 +67,7 @@ tomli==1.2.3
|
||||
traitlets==5.0.5
|
||||
transaction==3.0.1
|
||||
translationstring==1.4
|
||||
+unicodedata2==16.0.0
|
||||
urllib3==1.26.6
|
||||
venusian==3.0.0
|
||||
wcwidth==0.2.5
|
||||
|
||||
@@ -99,3 +99,8 @@ def test_consecutive_spaces_collapsed():
|
||||
"""Ensure runs of consecutive spaces are "collapsed" inside the string."""
|
||||
original = "I    wanted   to  space    this   out"
|
||||
assert process_string(original) == "I wanted to space this out"
|
||||
|
||||
|
||||
+def test_unicode_15_moose_kept():
|
||||
+"""Ensure newer emoji introduced in Unicode 15 are kept."""
|
||||
+assert process_string("🫎") == "🫎"
|
||||
|
||||
@@ -4,13 +4,13 @@
|
||||
"""Functions related to processing/manipulating strings."""
|
||||
|
||||
import re
|
||||
-import unicodedata
|
||||
from collections.abc import Iterator
|
||||
from typing import Optional
|
||||
from urllib.parse import quote
|
||||
from xml.etree.ElementTree import Element
|
||||
|
||||
from html5lib import HTMLParser
|
||||
+import unicodedata2
|
||||
|
||||
|
||||
# regex for matching an entire word, handles words that include an apostrophe
|
||||
@@ -177,10 +177,11 @@ def simplify_string(original: str) -> str:
|
||||
|
||||
def _sanitize_characters(original: str) -> str:
|
||||
"""Process a string and filter/replace problematic unicode."""
|
||||
+# pylint: disable=c-extension-no-member
|
||||
final_characters = []
|
||||
|
||||
for index, char in enumerate(original):
|
||||
-category = unicodedata.category(char)
|
||||
+category = unicodedata2.category(char)
|
||||
|
||||
if category.startswith("Z"):
|
||||
# "separator" chars - replace with a normal space
|
||||
@@ -195,8 +196,8 @@ def _sanitize_characters(original: str) -> str:
|
||||
# don't break certain emoji variants
|
||||
if char == "\u200D":
|
||||
try:
|
||||
-before_category = unicodedata.category(final_characters[-1])
|
||||
-after_category = unicodedata.category(original[index + 1])
|
||||
+before_category = unicodedata2.category(final_characters[-1])
|
||||
+after_category = unicodedata2.category(original[index + 1])
|
||||
except IndexError:
|
||||
continue
|
||||
|
||||
|
||||
Reference in New Issue
Block a user