Remove most emoji-optional code

Almost all of the code previously used to make the `emoji` module
optional is removed in this commit. I always intended to make `emoji` a
hard dependency in v3.0.0 and to remove the code that made it optional,
but I only remembered to do the former; in fact, I even added
emoji-optional code to the new model handling code. This code cannot be
removed completely, because 3.0.0 will not deserialize a model whose
JSON config lacks the `has_emoji` field, but this commit removes as
much as possible without breaking the model format or the API version.

See also issue #11
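
For reference, the config line that serialize() writes after the
"GPTC model v4" magic line looks roughly like this (a sketch with
made-up example values, not code from this commit):

    import json

    # Illustrative v4 header: "spam"/"ham" and the n-gram length are
    # example values. 3.0.0's deserialize() raises InvalidModelError
    # when "has_emoji" is missing, which is why serialize() must keep
    # writing the field even though its value no longer matters.
    config = {"names": ["spam", "ham"], "max_ngram_length": 2, "has_emoji": True}
    header = b"GPTC model v4\n" + json.dumps(config).encode("utf-8") + b"\n"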
Author: Samuel Sloniker
Date: 2022-11-25 19:28:30 -08:00
parent 7ecb7dd90a
commit f1a1ed9e2a
Signed by: kj7rrv
GPG Key ID: 1BB4029E66285A62
4 changed files with 25 additions and 39 deletions

gptc/__init__.py

@@ -5,7 +5,6 @@
 from gptc.compiler import compile as compile
 from gptc.classifier import Classifier as Classifier
 from gptc.pack import pack as pack
-from gptc.tokenizer import has_emoji as has_emoji
 from gptc.model import Model as Model, deserialize as deserialize
 from gptc.exceptions import (
     GPTCError as GPTCError,
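
One consequence of dropping the re-export: downstream code that checked
the flag now fails at import time (hypothetical caller, not from this
repository):

    # Raises ImportError after this commit; emoji support is always
    # available instead, so the check is no longer needed.
    from gptc import has_emoji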

gptc/classifier.py

@@ -29,7 +29,6 @@ class Classifier:
         self.model = model
         model_ngrams = model.max_ngram_length
         self.max_ngram_length = min(max_ngram_length, model_ngrams)
-        self.has_emoji = gptc.tokenizer.has_emoji and model.has_emoji
 
     def confidence(self, text: str) -> Dict[str, float]:
         """Classify text with confidence.
@@ -49,9 +48,7 @@ class Classifier:
         model = self.model.weights
 
-        tokens = gptc.tokenizer.tokenize(
-            text, self.max_ngram_length, self.has_emoji
-        )
+        tokens = gptc.tokenizer.tokenize(text, self.max_ngram_length)
 
         numbered_probs: Dict[int, float] = {}
         for word in tokens:
             try:
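
With the flag gone, callers pass only the text and the maximum n-gram
length; a minimal sketch of the simplified call (example text, not from
this commit):

    import gptc.tokenizer

    # The old third argument (use_emoji/has_emoji) no longer exists;
    # emoji handling is always on.
    tokens = gptc.tokenizer.tokenize("I love GPTC 😊", 2)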

gptc/model.py

@@ -12,14 +12,10 @@ class Model:
         weights: Dict[int, List[int]],
         names: List[str],
         max_ngram_length: int,
-        has_emoji: Union[None, bool] = None,
     ):
         self.weights = weights
         self.names = names
         self.max_ngram_length = max_ngram_length
-        self.has_emoji = (
-            gptc.tokenizer.has_emoji if has_emoji is None else has_emoji
-        )
 
     def serialize(self) -> bytes:
         out = b"GPTC model v4\n"
@@ -28,7 +24,16 @@ class Model:
             {
                 "names": self.names,
                 "max_ngram_length": self.max_ngram_length,
-                "has_emoji": self.has_emoji,
+                "has_emoji": True,
+                # Due to an oversight in development, version 3.0.0 still
+                # had the code used to make emoji support optional, even
+                # though the `emoji` library was made a hard dependency.
+                # Part of this code checked whether or not the model
+                # supports emoji; deserialization would not work in 3.0.0
+                # if the model was compiled without this field. Emoji are
+                # always supported with 3.0.0 and newer when GPTC has been
+                # installed correctly, so this value should always be True.
+                # Related: #11
             }
         ).encode("utf-8")
         + b"\n"
@@ -57,14 +62,11 @@ def deserialize(encoded_model: bytes) -> Model:
     try:
         names = config["names"]
         max_ngram_length = config["max_ngram_length"]
-        has_emoji = config["has_emoji"]
     except KeyError:
         raise InvalidModelError()
 
     if not (
-        isinstance(names, list)
-        and isinstance(max_ngram_length, int)
-        and isinstance(has_emoji, bool)
+        isinstance(names, list) and isinstance(max_ngram_length, int)
     ) or not all([isinstance(name, str) for name in names]):
         raise InvalidModelError()
 
@@ -86,4 +88,4 @@ def deserialize(encoded_model: bytes) -> Model:
         for code in weight_codes
     }
 
-    return Model(weights, names, max_ngram_length, has_emoji)
+    return Model(weights, names, max_ngram_length)
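
To illustrate the loosened validation, a hypothetical helper (not part
of GPTC) that reads back the config line deserialize() checks; only
"names" and "max_ngram_length" are required now, and a "has_emoji" key
is simply ignored:

    import json

    def read_config(encoded_model: bytes) -> dict:
        # The layout, per serialize() above: magic line, JSON config
        # line, then the weights section.
        magic, config_line, _weights = encoded_model.split(b"\n", 2)
        assert magic == b"GPTC model v4"
        return json.loads(config_line.decode("utf-8"))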

gptc/tokenizer.py

@@ -3,38 +3,26 @@
 from typing import List, Union
 import hashlib
 import base64
+import emoji
 
-try:
-    import emoji
-
-    has_emoji = True
-except ImportError:
-    has_emoji = False
-
 
-def tokenize(
-    text: str, max_ngram_length: int = 1, use_emoji: bool = True
-) -> List[int]:
-    """Convert a string to a list of lemmas."""
-    converted_text: Union[str, List[str]] = text.lower()
-
-    if has_emoji and use_emoji:
-        text = text.lower()
-        parts = []
-        highest_end = 0
-        for emoji_part in emoji.emoji_list(text):
-            parts += list(text[highest_end : emoji_part["match_start"]])
-            parts.append(emoji_part["emoji"])
-            highest_end = emoji_part["match_end"]
-        parts += list(text[highest_end:])
-        converted_text = [part for part in parts if part]
+def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
+    text = text.lower()
+    parts = []
+    highest_end = 0
+    for emoji_part in emoji.emoji_list(text):
+        parts += list(text[highest_end : emoji_part["match_start"]])
+        parts.append(emoji_part["emoji"])
+        highest_end = emoji_part["match_end"]
+    parts += list(text[highest_end:])
+    converted_text = [part for part in parts if part]
 
     tokens = [""]
     for char in converted_text:
         if char.isalpha() or char == "'":
             tokens[-1] += char
-        elif has_emoji and emoji.is_emoji(char):
+        elif emoji.is_emoji(char):
             tokens.append(char)
             tokens.append("")
         elif tokens[-1] != "":
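
The tokenizer now depends on the emoji package unconditionally. The two
calls it makes behave as below (example string; output shape per the
package's documented API):

    import emoji

    # emoji_list() reports each emoji with its codepoint span, which
    # tokenize() uses to split emoji out before scanning characters.
    print(emoji.emoji_list("good 👍🏼 job"))
    # [{'match_start': 5, 'match_end': 7, 'emoji': '👍🏼'}]

    print(emoji.is_emoji("👍"))  # True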