From bd0028a108619fe78a8dc60f37514c3c48fb4edf Mon Sep 17 00:00:00 2001 From: Samuel Sloniker Date: Sun, 17 Jul 2022 16:14:02 -0700 Subject: [PATCH] Add emoji support to tokenizer --- gptc/tokenizer.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py index bfceec8..c9e9fd0 100644 --- a/gptc/tokenizer.py +++ b/gptc/tokenizer.py @@ -1,13 +1,35 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +try: + import emoji + + has_emoji = True +except ImportError: + has_emoji = False + def tokenize(text, max_ngram_length=1): """Convert a string to a list of lemmas.""" + text = text.lower() + + if has_emoji: + parts = [] + highest_end = 0 + for emoji_part in emoji.emoji_list(text): + parts += list(text[highest_end : emoji_part["match_start"]]) + parts.append(emoji_part["emoji"]) + highest_end = emoji_part["match_end"] + parts += list(text[highest_end:]) + text = [part for part in parts if part] + tokens = [""] - for char in text.lower(): + for char in text: if char.isalpha() or char == "'": tokens[-1] += char + elif has_emoji and emoji.is_emoji(char): + tokens.append(char) + tokens.append("") elif tokens[-1] != "": tokens.append("")