Compare commits

...

2 Commits

Author SHA1 Message Date
7250787228 Bump version to 2.1.1 2022-07-20 14:06:56 -07:00
9538cf8c22 Fix emoji handling 2022-07-20 14:06:27 -07:00
5 changed files with 13 additions and 6 deletions

View File

@@ -33,7 +33,9 @@ class Classifier:
self.model = model
model_ngrams = cast(int, model.get("__ngrams__", 1))
self.max_ngram_length = min(max_ngram_length, model_ngrams)
self.has_emoji = gptc.tokenizer.has_emoji and gptc.model_info.model_has_emoji(model)
self.has_emoji = (
gptc.tokenizer.has_emoji and gptc.model_info.model_has_emoji(model)
)
def confidence(self, text: str) -> Dict[str, float]:
"""Classify text with confidence.
@@ -53,7 +55,9 @@ class Classifier:
model = self.model
tokens = gptc.tokenizer.tokenize(text, self.max_ngram_length)
tokens = gptc.tokenizer.tokenize(
text, self.max_ngram_length, self.has_emoji
)
numbered_probs: Dict[int, float] = {}
for word in tokens:
try:

View File

@@ -77,5 +77,6 @@ def compile(
model["__names__"] = names
model["__ngrams__"] = max_ngram_length
model["__version__"] = 3
model["__emoji__"] = int(tokenizer.has_emoji)
return model

View File

@@ -5,4 +5,4 @@ from typing import Dict, Union, cast, List
def model_has_emoji(model: gptc.compiler.MODEL) -> bool:
return cast(int, model.get("__emoji__]", 0)) == 1
return cast(int, model.get("__emoji__", 0)) == 1

View File

@@ -10,11 +10,13 @@ except ImportError:
has_emoji = False
def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
def tokenize(
text: str, max_ngram_length: int = 1, use_emoji: bool = True
) -> List[str]:
"""Convert a string to a list of lemmas."""
converted_text: Union[str, List[str]] = text.lower()
if has_emoji:
if has_emoji and use_emoji:
parts = []
highest_end = 0
for emoji_part in emoji.emoji_list(text):

View File

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "gptc"
version = "2.1.0"
version = "2.1.1"
description = "General-purpose text classifier"
readme = "README.md"
authors = [{ name = "Samuel Sloniker", email = "sam@kj7rrv.com"}]