Lightweight NLP library in pure Python — currently implements a text classifier.
# SPDX-License-Identifier: GPL-3.0-or-later
from typing import List, Union
try:
    import emoji

    has_emoji = True
except ImportError:
    # The emoji package is optional; without it, tokenization simply
    # treats emoji codepoints like any other non-alphabetic character.
    has_emoji = False


def tokenize(
    text: str, max_ngram_length: int = 1, use_emoji: bool = True
) -> List[str]:
    """Convert a string to a list of lemmas.

    Lemmas are lowercase alphabetic runs (apostrophes are kept, so
    "don't" stays one token); everything else is a separator. When the
    optional ``emoji`` package is installed and ``use_emoji`` is True,
    each emoji (including multi-codepoint sequences) becomes its own
    token. With ``max_ngram_length > 1`` the result additionally
    contains every space-joined n-gram up to that length.

    :param text: input string to tokenize.
    :param max_ngram_length: longest n-gram to emit (1 = tokens only).
    :param use_emoji: whether to emit emoji as tokens (requires the
        ``emoji`` package; silently ignored if it is missing).
    :return: list of tokens, followed by 2..max_ngram_length n-grams
        when requested.
    """
    lowered = text.lower()
    converted_text: Union[str, List[str]] = lowered

    # Single flag for both emoji code paths below. Previously the char
    # loop checked only ``has_emoji``, so emoji were still tokenized
    # even when the caller passed use_emoji=False; gating on this flag
    # fixes that.
    emoji_enabled = has_emoji and use_emoji

    if emoji_enabled:
        # Pre-split the text so each emoji sequence survives as a single
        # element; all other characters become one-char elements.
        parts: List[str] = []
        highest_end = 0
        for emoji_part in emoji.emoji_list(lowered):
            parts += list(lowered[highest_end : emoji_part["match_start"]])
            parts.append(emoji_part["emoji"])
            highest_end = emoji_part["match_end"]
        parts += list(lowered[highest_end:])
        converted_text = [part for part in parts if part]

    # Accumulate characters into the current (last) token; separators
    # close it by starting a fresh empty slot, which is filtered later.
    tokens = [""]
    for char in converted_text:
        if char.isalpha() or char == "'":
            tokens[-1] += char
        elif emoji_enabled and emoji.is_emoji(char):
            tokens.append(char)
            tokens.append("")
        elif tokens[-1] != "":
            tokens.append("")

    tokens = [token for token in tokens if token]

    if max_ngram_length == 1:
        return tokens

    # Emit every n-gram from length 1 up to max_ngram_length, in order
    # of increasing length (unigrams first), matching the 1-gram case.
    ngrams = []
    for ngram_length in range(1, max_ngram_length + 1):
        for index in range(len(tokens) + 1 - ngram_length):
            ngrams.append(" ".join(tokens[index : index + ngram_length]))
    return ngrams