diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py
index bfceec8..c9e9fd0 100644
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@@ -1,13 +1,35 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 
+try:
+    import emoji
+
+    has_emoji = True
+except ImportError:
+    has_emoji = False
+
 
 def tokenize(text, max_ngram_length=1):
     """Convert a string to a list of lemmas."""
+    text = text.lower()
+
+    if has_emoji:
+        parts = []
+        highest_end = 0
+        for emoji_part in emoji.emoji_list(text):
+            parts += list(text[highest_end : emoji_part["match_start"]])
+            parts.append(emoji_part["emoji"])
+            highest_end = emoji_part["match_end"]
+        parts += list(text[highest_end:])
+        text = [part for part in parts if part]
+
     tokens = [""]
 
-    for char in text.lower():
+    for char in text:
         if char.isalpha() or char == "'":
             tokens[-1] += char
+        elif has_emoji and emoji.is_emoji(char):
+            tokens.append(char)
+            tokens.append("")
         elif tokens[-1] != "":
             tokens.append("")
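
Note on the pre-splitting pass: many emoji are multi-codepoint sequences (ZWJ combinations, skin-tone modifiers, flags), so the character loop on its own would shred them. Running emoji.emoji_list() first keeps each complete emoji as a single element of the list that the loop then iterates over. A minimal sketch of what that call returns (the sample text is illustrative, not from the patch):

    import emoji

    text = "hi 👩‍💻"
    print(emoji.emoji_list(text))
    # [{'match_start': 3, 'match_end': 6, 'emoji': '👩‍💻'}]
    # '👩‍💻' is three code points (woman + ZWJ + laptop), so iterating
    # over the raw string character by character would split it apart.

With the optional emoji package installed, tokenize() should then emit each emoji as a standalone token alongside the alphabetic lemmas; without it, has_emoji is False, text stays a plain string, and the loop behaves exactly as before, discarding emoji as non-alphabetic characters.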