From bd0028a108619fe78a8dc60f37514c3c48fb4edf Mon Sep 17 00:00:00 2001
From: Samuel Sloniker <sam@kj7rrv.com>
Date: Sun, 17 Jul 2022 16:14:02 -0700
Subject: [PATCH] Add emoji support to tokenizer

---
 gptc/tokenizer.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py
index bfceec8..c9e9fd0 100644
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@@ -1,13 +1,35 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 
+try:
+    import emoji
+
+    has_emoji = True
+except ImportError:
+    has_emoji = False
+
 
 def tokenize(text, max_ngram_length=1):
     """Convert a string to a list of lemmas."""
+    text = text.lower()
+
+    if has_emoji:
+        parts = []
+        highest_end = 0
+        for emoji_part in emoji.emoji_list(text):
+            parts += list(text[highest_end : emoji_part["match_start"]])
+            parts.append(emoji_part["emoji"])
+            highest_end = emoji_part["match_end"]
+        parts += list(text[highest_end:])
+        text = [part for part in parts if part]
+
     tokens = [""]
 
-    for char in text.lower():
+    for char in text:
         if char.isalpha() or char == "'":
             tokens[-1] += char
+        elif has_emoji and emoji.is_emoji(char):
+            tokens.append(char)
+            tokens.append("")
         elif tokens[-1] != "":
             tokens.append("")