Add emoji support to tokenizer

2022-07-17 16:14:02 -07:00 · 2022-07-17 16:14:02 -07:00 · bd0028a108
commit bd0028a108
parent 62c3c27ddd
1 changed file with 23 additions and 1 deletion
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@@ -1,13 +1,35 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 try:
    import emoji
    has_emoji = True
 except ImportError:
    has_emoji = False
 def tokenize(text, max_ngram_length=1):
    """Convert a string to a list of lemmas."""
    text = text.lower()
    if has_emoji:
        parts = []
        highest_end = 0
        for emoji_part in emoji.emoji_list(text):
            parts += list(text[highest_end : emoji_part["match_start"]])
            parts.append(emoji_part["emoji"])
            highest_end = emoji_part["match_end"]
        parts += list(text[highest_end:])
        text = [part for part in parts if part]
    tokens = [""]
-    for char in text.lower():
+    for char in text:
        if char.isalpha() or char == "'":
            tokens[-1] += char
        elif has_emoji and emoji.is_emoji(char):
            tokens.append(char)
            tokens.append("")
        elif tokens[-1] != "":
            tokens.append("")