Add emoji support to tokenizer
parent 62c3c27ddd
commit bd0028a108
@@ -1,13 +1,35 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 
+try:
+    import emoji
+
+    has_emoji = True
+except ImportError:
+    has_emoji = False
+
 
 def tokenize(text, max_ngram_length=1):
     """Convert a string to a list of lemmas."""
+    text = text.lower()
+
+    if has_emoji:
+        parts = []
+        highest_end = 0
+        for emoji_part in emoji.emoji_list(text):
+            parts += list(text[highest_end : emoji_part["match_start"]])
+            parts.append(emoji_part["emoji"])
+            highest_end = emoji_part["match_end"]
+        parts += list(text[highest_end:])
+        text = [part for part in parts if part]
+
     tokens = [""]
 
-    for char in text.lower():
+    for char in text:
         if char.isalpha() or char == "'":
             tokens[-1] += char
+        elif has_emoji and emoji.is_emoji(char):
+            tokens.append(char)
+            tokens.append("")
         elif tokens[-1] != "":
             tokens.append("")
 
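For reference, the pre-splitting step leans on emoji.emoji_list(), which in current versions of the emoji package returns one dict per match with "emoji", "match_start", and "match_end" keys. The sketch below (illustrative input string, assuming the emoji package is installed) shows how that loop turns a lowercased string into a mixed list of single characters and whole emoji, so multi-codepoint emoji are not split apart:

import emoji

text = "i use 🐍 and 👨‍💻"  # illustrative input; 👨‍💻 is a multi-codepoint ZWJ sequence

parts = []
highest_end = 0
for emoji_part in emoji.emoji_list(text):
    # characters of the plain text before this emoji, one element each
    parts += list(text[highest_end : emoji_part["match_start"]])
    # the emoji itself stays together as a single element
    parts.append(emoji_part["emoji"])
    highest_end = emoji_part["match_end"]
parts += list(text[highest_end:])

print([part for part in parts if part])
# ['i', ' ', 'u', 's', 'e', ' ', '🐍', ' ', 'a', 'n', 'd', ' ', '👨‍💻']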
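And a rough usage sketch of the updated function (import path and exact output are illustrative; the final token list also depends on the n-gram handling further down the file, which this hunk does not touch):

# assuming the module above is importable as `tokenizer` (hypothetical path)
from tokenizer import tokenize

print(tokenize("Python is fun 🐍🔥"))
# with the emoji package installed, each emoji should come through as its
# own token, roughly: ['python', 'is', 'fun', '🐍', '🔥']
# without it, has_emoji is False and emoji behave like any other
# non-alphabetic character, i.e. they merely separate tokens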