diff --git a/gptc/__init__.py b/gptc/__init__.py
index feb7b85..b970c05 100644
--- a/gptc/__init__.py
+++ b/gptc/__init__.py
@@ -5,7 +5,6 @@
 from gptc.compiler import compile as compile
 from gptc.classifier import Classifier as Classifier
 from gptc.pack import pack as pack
-from gptc.tokenizer import has_emoji as has_emoji
 from gptc.model import Model as Model, deserialize as deserialize
 from gptc.exceptions import (
     GPTCError as GPTCError,
diff --git a/gptc/classifier.py b/gptc/classifier.py
index e18bad5..de9e0a8 100755
--- a/gptc/classifier.py
+++ b/gptc/classifier.py
@@ -29,7 +29,6 @@ class Classifier:
         self.model = model
         model_ngrams = model.max_ngram_length
         self.max_ngram_length = min(max_ngram_length, model_ngrams)
-        self.has_emoji = gptc.tokenizer.has_emoji and model.has_emoji
 
     def confidence(self, text: str) -> Dict[str, float]:
         """Classify text with confidence.
@@ -49,9 +48,7 @@ class Classifier:
 
         model = self.model.weights
 
-        tokens = gptc.tokenizer.tokenize(
-            text, self.max_ngram_length, self.has_emoji
-        )
+        tokens = gptc.tokenizer.tokenize(text, self.max_ngram_length)
         numbered_probs: Dict[int, float] = {}
         for word in tokens:
             try:
diff --git a/gptc/model.py b/gptc/model.py
index a55ec32..014189b 100644
--- a/gptc/model.py
+++ b/gptc/model.py
@@ -12,14 +12,10 @@ class Model:
         weights: Dict[int, List[int]],
         names: List[str],
         max_ngram_length: int,
-        has_emoji: Union[None, bool] = None,
     ):
         self.weights = weights
         self.names = names
         self.max_ngram_length = max_ngram_length
-        self.has_emoji = (
-            gptc.tokenizer.has_emoji if has_emoji is None else has_emoji
-        )
 
     def serialize(self) -> bytes:
         out = b"GPTC model v4\n"
@@ -28,7 +24,16 @@
             {
                 "names": self.names,
                 "max_ngram_length": self.max_ngram_length,
-                "has_emoji": self.has_emoji,
+                "has_emoji": True,
+                # Due to an oversight in development, version 3.0.0 still
+                # had the code used to make emoji support optional, even
+                # though the `emoji` library was made a hard dependency.
+                # Part of this code checked whether or not the model
+                # supports emoji; deserialization would not work in 3.0.0
+                # if the model was compiled without this field. Emoji are
+                # always supported with 3.0.0 and newer when GPTC has been
+                # installed correctly, so this value should always be True.
+                # Related: #11
             }
         ).encode("utf-8")
         + b"\n"
@@ -57,14 +62,11 @@ def deserialize(encoded_model: bytes) -> Model:
     try:
         names = config["names"]
         max_ngram_length = config["max_ngram_length"]
-        has_emoji = config["has_emoji"]
     except KeyError:
         raise InvalidModelError()
 
     if not (
-        isinstance(names, list)
-        and isinstance(max_ngram_length, int)
-        and isinstance(has_emoji, bool)
+        isinstance(names, list) and isinstance(max_ngram_length, int)
     ) or not all([isinstance(name, str) for name in names]):
         raise InvalidModelError()
 
@@ -86,4 +88,4 @@ def deserialize(encoded_model: bytes) -> Model:
         for code in weight_codes
     }
 
-    return Model(weights, names, max_ngram_length, has_emoji)
+    return Model(weights, names, max_ngram_length)
diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py
index a9405da..67499db 100644
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@@ -3,38 +3,26 @@
 from typing import List, Union
 import hashlib
 import base64
-
-try:
-    import emoji
-
-    has_emoji = True
-except ImportError:
-    has_emoji = False
+import emoji
 
 
-def tokenize(
-    text: str, max_ngram_length: int = 1, use_emoji: bool = True
-) -> List[int]:
-    """Convert a string to a list of lemmas."""
-    converted_text: Union[str, List[str]] = text.lower()
-
-    if has_emoji and use_emoji:
-        text = text.lower()
-        parts = []
-        highest_end = 0
-        for emoji_part in emoji.emoji_list(text):
-            parts += list(text[highest_end : emoji_part["match_start"]])
-            parts.append(emoji_part["emoji"])
-            highest_end = emoji_part["match_end"]
-        parts += list(text[highest_end:])
-        converted_text = [part for part in parts if part]
+def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
+    text = text.lower()
+    parts = []
+    highest_end = 0
+    for emoji_part in emoji.emoji_list(text):
+        parts += list(text[highest_end : emoji_part["match_start"]])
+        parts.append(emoji_part["emoji"])
+        highest_end = emoji_part["match_end"]
+    parts += list(text[highest_end:])
+    converted_text = [part for part in parts if part]
 
     tokens = [""]
     for char in converted_text:
         if char.isalpha() or char == "'":
             tokens[-1] += char
-        elif has_emoji and emoji.is_emoji(char):
+        elif emoji.is_emoji(char):
             tokens.append(char)
             tokens.append("")
         elif tokens[-1] != "":