Normalize characters

Closes #3
This commit is contained in:
Samuel Sloniker 2022-12-24 10:46:40 -08:00
parent 295a1189de
commit 10668691ea
Signed by: kj7rrv
GPG Key ID: 1BB4029E66285A62

View File

@@ -3,10 +3,11 @@
from typing import List, Union
import hashlib
import emoji
import unicodedata
def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
text = text.lower()
text = unicodedata.normalize("NFKD", text).lower()
parts = []
highest_end = 0
for emoji_part in emoji.emoji_list(text):