You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
50 lines
1.4 KiB
50 lines
1.4 KiB
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
from typing import List, Union |
|
|
|
try:
    import emoji

    has_emoji = True
except ImportError:
    # The emoji package is optional: without it no emoji-specific handling
    # is done and emoji characters act like any other separator.
    has_emoji = False


def tokenize(
    text: str, max_ngram_length: int = 1, use_emoji: bool = True
) -> List[str]:
    """Split *text* into lowercase word tokens, optionally expanded to n-grams.

    The text is lowercased, then scanned piece by piece. Alphabetic
    characters and apostrophes accumulate into the current word; any other
    character ends the current word. When emoji handling is active (the
    ``emoji`` package imported successfully and ``use_emoji`` is true), every
    emoji reported by ``emoji.emoji_list`` is kept as a token of its own.

    Args:
        text: The string to tokenize.
        max_ngram_length: Longest n-gram to emit. For every length from 1 up
            to this value, each run of that many consecutive tokens is joined
            with a single space. A value below 1 produces an empty list.
        use_emoji: Whether emoji should become tokens. When false, emoji are
            treated like any other non-alphabetic separator.

    Returns:
        All 1-grams in order of appearance, followed by all 2-grams, and so
        on up to ``max_ngram_length``.
    """
    emoji_enabled = has_emoji and use_emoji
    lowered = text.lower()

    # Without emoji handling the string is walked one character at a time.
    pieces: Union[str, List[str]] = lowered
    if emoji_enabled:
        # Pre-split so that an emoji made of several code points stays
        # together as a single piece instead of being torn apart.
        split_pieces: List[str] = []
        cursor = 0
        for match in emoji.emoji_list(lowered):
            split_pieces.extend(lowered[cursor : match["match_start"]])
            split_pieces.append(match["emoji"])
            cursor = match["match_end"]
        split_pieces.extend(lowered[cursor:])
        pieces = split_pieces

    tokens: List[str] = []
    word: List[str] = []
    for piece in pieces:
        if piece.isalpha() or piece == "'":
            word.append(piece)
            continue
        # Anything else terminates the word being built.
        if word:
            tokens.append("".join(word))
            word = []
        # Only honour emoji when the caller asked for them; previously this
        # check ignored use_emoji and leaked emoji tokens when it was False.
        if emoji_enabled and emoji.is_emoji(piece):
            tokens.append(piece)
    if word:
        tokens.append("".join(word))

    # For max_ngram_length == 1 this is simply the token list itself.
    return [
        " ".join(tokens[start : start + size])
        for size in range(1, max_ngram_length + 1)
        for start in range(len(tokens) - size + 1)
    ]
|
|
|