Add emoji support to tokenizer
This commit is contained in:
parent
62c3c27ddd
commit
bd0028a108
|
@@ -1,13 +1,35 @@
|
||||||
# SPDX-License-Identifier: LGPL-3.0-or-later
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
||||||
|
|
||||||
|
try:
|
||||||
|
import emoji
|
||||||
|
|
||||||
|
has_emoji = True
|
||||||
|
except ImportError:
|
||||||
|
has_emoji = False
|
||||||
|
|
||||||
|
|
||||||
def tokenize(text, max_ngram_length=1):
|
def tokenize(text, max_ngram_length=1):
|
||||||
"""Convert a string to a list of lemmas."""
|
"""Convert a string to a list of lemmas."""
|
||||||
|
text = text.lower()
|
||||||
|
|
||||||
|
if has_emoji:
|
||||||
|
parts = []
|
||||||
|
highest_end = 0
|
||||||
|
for emoji_part in emoji.emoji_list(text):
|
||||||
|
parts += list(text[highest_end : emoji_part["match_start"]])
|
||||||
|
parts.append(emoji_part["emoji"])
|
||||||
|
highest_end = emoji_part["match_end"]
|
||||||
|
parts += list(text[highest_end:])
|
||||||
|
text = [part for part in parts if part]
|
||||||
|
|
||||||
tokens = [""]
|
tokens = [""]
|
||||||
|
|
||||||
for char in text.lower():
|
for char in text:
|
||||||
if char.isalpha() or char == "'":
|
if char.isalpha() or char == "'":
|
||||||
tokens[-1] += char
|
tokens[-1] += char
|
||||||
|
elif has_emoji and emoji.is_emoji(char):
|
||||||
|
tokens.append(char)
|
||||||
|
tokens.append("")
|
||||||
elif tokens[-1] != "":
|
elif tokens[-1] != "":
|
||||||
tokens.append("")
|
tokens.append("")
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user