Compare commits

..

2 Commits

Author SHA1 Message Date
b4766cb613
Bump version to 3.0.1 2022-11-25 19:44:32 -08:00
f1a1ed9e2a
Remove most emoji-optional code
Almost all of the code previously used to make the emoji module optional
is removed in this commit. It was always my intent to make the `emoji`
module a hard dependency in v3.0.0 and remove the code for making it
optional, but for some reason I remembered only the former; in fact, I
even added emoji-optional code to the new model-handling code. I can't
completely remove this code because 3.0.0 will not
successfully deserialize a model without the `has_emoji` field in the
JSON config options, but this commit removes as much as possible without
breaking the model format and API version.

See also issue #11
2022-11-25 19:39:31 -08:00
5 changed files with 26 additions and 40 deletions

View File

@ -5,7 +5,6 @@
from gptc.compiler import compile as compile
from gptc.classifier import Classifier as Classifier
from gptc.pack import pack as pack
from gptc.tokenizer import has_emoji as has_emoji
from gptc.model import Model as Model, deserialize as deserialize
from gptc.exceptions import (
GPTCError as GPTCError,

View File

@ -29,7 +29,6 @@ class Classifier:
self.model = model
model_ngrams = model.max_ngram_length
self.max_ngram_length = min(max_ngram_length, model_ngrams)
self.has_emoji = gptc.tokenizer.has_emoji and model.has_emoji
def confidence(self, text: str) -> Dict[str, float]:
"""Classify text with confidence.
@ -49,9 +48,7 @@ class Classifier:
model = self.model.weights
tokens = gptc.tokenizer.tokenize(
text, self.max_ngram_length, self.has_emoji
)
tokens = gptc.tokenizer.tokenize(text, self.max_ngram_length)
numbered_probs: Dict[int, float] = {}
for word in tokens:
try:

View File

@ -12,14 +12,10 @@ class Model:
weights: Dict[int, List[int]],
names: List[str],
max_ngram_length: int,
has_emoji: Union[None, bool] = None,
):
self.weights = weights
self.names = names
self.max_ngram_length = max_ngram_length
self.has_emoji = (
gptc.tokenizer.has_emoji if has_emoji is None else has_emoji
)
def serialize(self) -> bytes:
out = b"GPTC model v4\n"
@ -28,7 +24,16 @@ class Model:
{
"names": self.names,
"max_ngram_length": self.max_ngram_length,
"has_emoji": self.has_emoji,
"has_emoji": True,
# Due to an oversight in development, version 3.0.0 still
# had the code used to make emoji support optional, even
# though the `emoji` library was made a hard dependency.
# Part of this code checked whether or not the model
# supports emoji; deserialization would not work in 3.0.0
# if the model was compiled without this field. Emoji are
# always supported with 3.0.0 and newer when GPTC has been
# installed correctly, so this value should always be True.
# Related: #11
}
).encode("utf-8")
+ b"\n"
@ -57,14 +62,11 @@ def deserialize(encoded_model: bytes) -> Model:
try:
names = config["names"]
max_ngram_length = config["max_ngram_length"]
has_emoji = config["has_emoji"]
except KeyError:
raise InvalidModelError()
if not (
isinstance(names, list)
and isinstance(max_ngram_length, int)
and isinstance(has_emoji, bool)
isinstance(names, list) and isinstance(max_ngram_length, int)
) or not all([isinstance(name, str) for name in names]):
raise InvalidModelError()
@ -86,4 +88,4 @@ def deserialize(encoded_model: bytes) -> Model:
for code in weight_codes
}
return Model(weights, names, max_ngram_length, has_emoji)
return Model(weights, names, max_ngram_length)

View File

@ -3,22 +3,10 @@
from typing import List, Union
import hashlib
import base64
try:
import emoji
has_emoji = True
except ImportError:
has_emoji = False
import emoji
def tokenize(
text: str, max_ngram_length: int = 1, use_emoji: bool = True
) -> List[int]:
"""Convert a string to a list of lemmas."""
converted_text: Union[str, List[str]] = text.lower()
if has_emoji and use_emoji:
def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
text = text.lower()
parts = []
highest_end = 0
@ -34,7 +22,7 @@ def tokenize(
for char in converted_text:
if char.isalpha() or char == "'":
tokens[-1] += char
elif has_emoji and emoji.is_emoji(char):
elif emoji.is_emoji(char):
tokens.append(char)
tokens.append("")
elif tokens[-1] != "":

View File

@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "gptc"
version = "3.0.0"
version = "3.0.1"
description = "General-purpose text classifier"
readme = "README.md"
authors = [{ name = "Samuel Sloniker", email = "sam@kj7rrv.com"}]