Add emoji support to tokenizer

This commit is contained in:
Samuel Sloniker 2022-07-17 16:14:02 -07:00
parent 62c3c27ddd
commit bd0028a108

View File

@@ -1,13 +1,35 @@
# SPDX-License-Identifier: LGPL-3.0-or-later # SPDX-License-Identifier: LGPL-3.0-or-later
try:
import emoji
has_emoji = True
except ImportError:
has_emoji = False
def tokenize(text, max_ngram_length=1): def tokenize(text, max_ngram_length=1):
"""Convert a string to a list of lemmas.""" """Convert a string to a list of lemmas."""
text = text.lower()
if has_emoji:
parts = []
highest_end = 0
for emoji_part in emoji.emoji_list(text):
parts += list(text[highest_end : emoji_part["match_start"]])
parts.append(emoji_part["emoji"])
highest_end = emoji_part["match_end"]
parts += list(text[highest_end:])
text = [part for part in parts if part]
tokens = [""] tokens = [""]
for char in text.lower(): for char in text:
if char.isalpha() or char == "'": if char.isalpha() or char == "'":
tokens[-1] += char tokens[-1] += char
elif has_emoji and emoji.is_emoji(char):
tokens.append(char)
tokens.append("")
elif tokens[-1] != "": elif tokens[-1] != "":
tokens.append("") tokens.append("")