Remove most emoji-optional code

Almost all of the code previously used to make the `emoji` module optional is removed in this commit. It was always my intent to make the `emoji` module a hard dependency in v3.0.0 and to remove the code that made it optional, but I remembered to do the former and not the latter; in fact, I even added emoji-optional code to the new model handling code. I can't completely remove this code, because 3.0.0 will not successfully deserialize a model without the `has_emoji` field in the JSON config options, but this commit removes as much as possible without breaking the model format and API version. See also issue #11.
2022-11-25 19:28:30 -08:00 · 2022-11-25 19:28:30 -08:00 · f1a1ed9e2a
commit f1a1ed9e2a
parent 7ecb7dd90a
4 changed files with 25 additions and 39 deletions
--- a/gptc/__init__.py
+++ b/gptc/__init__.py
@@ -5,7 +5,6 @@
 from gptc.compiler import compile as compile
 from gptc.classifier import Classifier as Classifier
 from gptc.pack import pack as pack
-from gptc.tokenizer import has_emoji as has_emoji
 from gptc.model import Model as Model, deserialize as deserialize
 from gptc.exceptions import (
    GPTCError as GPTCError,
--- a/gptc/classifier.py
+++ b/gptc/classifier.py
@@ -29,7 +29,6 @@ class Classifier:
        self.model = model
        model_ngrams = model.max_ngram_length
        self.max_ngram_length = min(max_ngram_length, model_ngrams)
-        self.has_emoji = gptc.tokenizer.has_emoji and model.has_emoji
    def confidence(self, text: str) -> Dict[str, float]:
        """Classify text with confidence.
@@ -49,9 +48,7 @@ class Classifier:
        model = self.model.weights
-        tokens = gptc.tokenizer.tokenize(
-            text, self.max_ngram_length, self.has_emoji
-        )
+        tokens = gptc.tokenizer.tokenize(text, self.max_ngram_length)
        numbered_probs: Dict[int, float] = {}
        for word in tokens:
            try:
--- a/gptc/model.py
+++ b/gptc/model.py
@@ -12,14 +12,10 @@ class Model:
        weights: Dict[int, List[int]],
        names: List[str],
        max_ngram_length: int,
-        has_emoji: Union[None, bool] = None,
    ):
        self.weights = weights
        self.names = names
        self.max_ngram_length = max_ngram_length
-        self.has_emoji = (
-            gptc.tokenizer.has_emoji if has_emoji is None else has_emoji
-        )
    def serialize(self) -> bytes:
        out = b"GPTC model v4\n"
@@ -28,7 +24,16 @@ class Model:
                {
                    "names": self.names,
                    "max_ngram_length": self.max_ngram_length,
-                    "has_emoji": self.has_emoji,
+                    "has_emoji": True,
+                    # Due to an oversight in development, version 3.0.0 still
+                    # had the code used to make emoji support optional, even
+                    # though the `emoji` library was made a hard dependency.
+                    # Part of this code checked whether or not the model
+                    # supports emoji; deserialization would not work in 3.0.0
+                    # if the model was compiled without this field. Emoji are
+                    # always supported with 3.0.0 and newer when GPTC has been
+                    # installed correctly, so this value should always be True.
+                    # Related: #11
                }
            ).encode("utf-8")
            + b"\n"
@@ -57,14 +62,11 @@ def deserialize(encoded_model: bytes) -> Model:
    try:
        names = config["names"]
        max_ngram_length = config["max_ngram_length"]
-        has_emoji = config["has_emoji"]
    except KeyError:
        raise InvalidModelError()
    if not (
-        isinstance(names, list)
-        and isinstance(max_ngram_length, int)
-        and isinstance(has_emoji, bool)
+        isinstance(names, list) and isinstance(max_ngram_length, int)
    ) or not all([isinstance(name, str) for name in names]):
        raise InvalidModelError()
@@ -86,4 +88,4 @@ def deserialize(encoded_model: bytes) -> Model:
        for code in weight_codes
    }
-    return Model(weights, names, max_ngram_length, has_emoji)
+    return Model(weights, names, max_ngram_length)
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@@ -3,38 +3,26 @@
 from typing import List, Union
 import hashlib
 import base64
-
+import emoji
-try:
-    import emoji
-    has_emoji = True
-except ImportError:
-    has_emoji = False
-def tokenize(
-    text: str, max_ngram_length: int = 1, use_emoji: bool = True
-) -> List[int]:
-    """Convert a string to a list of lemmas."""
-    converted_text: Union[str, List[str]] = text.lower()
-
-    if has_emoji and use_emoji:
-        text = text.lower()
-        parts = []
-        highest_end = 0
-        for emoji_part in emoji.emoji_list(text):
-            parts += list(text[highest_end : emoji_part["match_start"]])
-            parts.append(emoji_part["emoji"])
-            highest_end = emoji_part["match_end"]
-        parts += list(text[highest_end:])
-        converted_text = [part for part in parts if part]
+def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
+    text = text.lower()
+    parts = []
+    highest_end = 0
+    for emoji_part in emoji.emoji_list(text):
+        parts += list(text[highest_end : emoji_part["match_start"]])
+        parts.append(emoji_part["emoji"])
+        highest_end = emoji_part["match_end"]
+    parts += list(text[highest_end:])
+    converted_text = [part for part in parts if part]
    tokens = [""]
    for char in converted_text:
        if char.isalpha() or char == "'":
            tokens[-1] += char
-        elif has_emoji and emoji.is_emoji(char):
+        elif emoji.is_emoji(char):
            tokens.append(char)
            tokens.append("")
        elif tokens[-1] != "":