Compare commits

..

7 Commits

8 changed files with 103 additions and 50 deletions

View File: README.md

@@ -4,6 +4,12 @@ General-purpose text classifier in Python
 
 GPTC provides both a CLI tool and a Python library.
 
+## Installation
+
+    pip install gptc[emoji] # handles emojis! (see section "Emoji")
+    # Or, if you don't need emoji support,
+    pip install gptc # no dependencies!
+
 ## CLI Tool
 
 ### Classifying text
@@ -72,6 +78,12 @@ reduced to the one used when compiling the model.
 Models compiled with older versions of GPTC which did not support ngrams are
 handled the same way as models compiled with `max_ngram_length=1`.
 
+## Emoji
+
+If the [`emoji`](https://pypi.org/project/emoji/) package is installed, GPTC
+will automatically handle emojis the same way as words. If it is not installed,
+GPTC will still work but will ignore emojis.
+
 ## Model format
 
 This section explains the raw model format, which is how you should create and
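Taken together, the README additions make emoji support a purely optional, runtime-detected feature. As a minimal sketch (relying on the module-level `has_emoji` flag added in the tokenizer diff below), calling code can run unchanged either way:

    import gptc.tokenizer

    # has_emoji is set once at import time, depending on whether the
    # optional emoji package was found (see the tokenizer diff below).
    if gptc.tokenizer.has_emoji:
        print("emojis will be tokenized like words")
    else:
        print("emojis will be ignored; everything else still works")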

View File

@@ -33,7 +33,9 @@ print(
 )
 
-classifier = gptc.Classifier(gptc.compile(raw_model, max_ngram_length), max_ngram_length)
+classifier = gptc.Classifier(
+    gptc.compile(raw_model, max_ngram_length), max_ngram_length
+)
 
 print(
     "Average classification time over",
     classify_iterations,

View File: gptc/__init__.py

@@ -2,6 +2,10 @@
 
 """General-Purpose Text Classifier"""
 
-from gptc.compiler import compile
-from gptc.classifier import Classifier
-from gptc.exceptions import *
+from gptc.compiler import compile as compile
+from gptc.classifier import Classifier as Classifier
+from gptc.exceptions import (
+    GPTCError as GPTCError,
+    ModelError as ModelError,
+    UnsupportedModelError as UnsupportedModelError,
+)
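Replacing the star import with `import X as X` aliases is the standard way to mark names as explicit re-exports: under PEP 484 rules (enforced by mypy's `--no-implicit-reexport`, which `--strict` enables), a name imported into `gptc/__init__.py` only counts as part of the package's public interface when re-exported this way. A quick sketch of what that buys callers:

    import gptc

    # All public names resolve from the package root, and a strict type
    # checker accepts these uses because the aliases above are explicit
    # re-exports rather than incidental imports.
    try:
        gptc.Classifier({"__version__": 0})  # wrong model version on purpose
    except gptc.UnsupportedModelError as err:
        print("caught:", err)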

View File: gptc/__main__.py

@@ -6,7 +6,8 @@ import json
 import sys
 
 import gptc
 
-def main():
+
+def main() -> None:
     parser = argparse.ArgumentParser(
         description="General Purpose Text Classifier", prog="gptc"
     )
@@ -14,11 +15,23 @@ def main():
     compile_parser = subparsers.add_parser("compile", help="compile a raw model")
     compile_parser.add_argument("model", help="raw model to compile")
-    compile_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
+    compile_parser.add_argument(
+        "--max-ngram-length",
+        "-n",
+        help="maximum ngram length",
+        type=int,
+        default=1,
+    )
 
     classify_parser = subparsers.add_parser("classify", help="classify text")
     classify_parser.add_argument("model", help="compiled model to use")
-    classify_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
+    classify_parser.add_argument(
+        "--max-ngram-length",
+        "-n",
+        help="maximum ngram length",
+        type=int,
+        default=1,
+    )
 
     group = classify_parser.add_mutually_exclusive_group()
     group.add_argument(
         "-j",

View File: gptc/classifier.py

@@ -2,6 +2,7 @@
 
 import gptc.tokenizer, gptc.compiler, gptc.exceptions, gptc.weighting
 import warnings
+from typing import Dict, Union, cast, List
 
 
 class Classifier:
@@ -24,17 +25,14 @@ class Classifier:
     """
 
-    def __init__(self, model, max_ngram_length=1):
+    def __init__(self, model: gptc.compiler.MODEL, max_ngram_length: int = 1):
         if model.get("__version__", 0) != 3:
-            raise gptc.exceptions.UnsupportedModelError(
-                f"unsupported model version"
-            )
+            raise gptc.exceptions.UnsupportedModelError(f"unsupported model version")
         self.model = model
-        self.max_ngram_length = min(
-            max_ngram_length, model.get("__ngrams__", 1)
-        )
+        model_ngrams = cast(int, model.get("__ngrams__", 1))
+        self.max_ngram_length = min(max_ngram_length, model_ngrams)
 
-    def confidence(self, text):
+    def confidence(self, text: str) -> Dict[str, float]:
         """Classify text with confidence.
 
         Parameters
@@ -52,29 +50,28 @@
         model = self.model
 
-        text = gptc.tokenizer.tokenize(text, self.max_ngram_length)
-        probs = {}
-        for word in text:
+        tokens = gptc.tokenizer.tokenize(text, self.max_ngram_length)
+        numbered_probs: Dict[int, float] = {}
+        for word in tokens:
             try:
-                weight, weighted_numbers = gptc.weighting.weight(
-                    [i / 65535 for i in model[word]]
+                weighted_numbers = gptc.weighting.weight(
+                    [i / 65535 for i in cast(List[float], model[word])]
                 )
                 for category, value in enumerate(weighted_numbers):
                     try:
-                        probs[category] += value
+                        numbered_probs[category] += value
                     except KeyError:
-                        probs[category] = value
+                        numbered_probs[category] = value
             except KeyError:
                 pass
-        probs = {
-            model["__names__"][category]: value
-            for category, value in probs.items()
+        total = sum(numbered_probs.values())
+        probs: Dict[str, float] = {
+            cast(List[str], model["__names__"])[category]: value / total
+            for category, value in numbered_probs.items()
         }
-        total = sum(probs.values())
-        probs = {category: value / total for category, value in probs.items()}
         return probs
 
-    def classify(self, text):
+    def classify(self, text: str) -> Union[str, None]:
         """Classify text.
 
         Parameters
@@ -89,7 +86,7 @@ class Classifier:
         category in the model were found.
         """
 
-        probs = self.confidence(text)
+        probs: Dict[str, float] = self.confidence(text)
         try:
             return sorted(probs.items(), key=lambda x: x[1])[-1][0]
         except IndexError:
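The reworked `confidence` computes the same result as before, with the normalization folded into the final dict comprehension and the `Dict[int, float]` and `Dict[str, float]` stages kept separate for the type checker. A usage sketch (assuming the raw-model format of mappings with "text" and "category" keys that the README's model-format section describes):

    import gptc

    raw_model = [
        {"category": "pizza", "text": "i love pizza with extra cheese"},
        {"category": "salad", "text": "a fresh salad with crisp lettuce"},
    ]

    classifier = gptc.Classifier(gptc.compile(raw_model), max_ngram_length=1)

    probs = classifier.confidence("fresh crisp lettuce")
    print(probs)                # e.g. {'pizza': 0.0, 'salad': 1.0}
    print(sum(probs.values()))  # 1.0 -- confidences are normalized
    print(classifier.classify("fresh crisp lettuce"))  # 'salad'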

View File: gptc/compiler.py

@@ -1,9 +1,14 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 
 import gptc.tokenizer
+from typing import Iterable, Mapping, List, Dict, Union
+
+WEIGHTS_T = List[int]
+CONFIG_T = Union[List[str], int, str]
+MODEL = Dict[str, Union[WEIGHTS_T, CONFIG_T]]
 
 
-def compile(raw_model, max_ngram_length=1):
+def compile(raw_model: Iterable[Mapping[str, str]], max_ngram_length: int = 1) -> MODEL:
     """Compile a raw model.
 
     Parameters
@@ -21,7 +26,7 @@ def compile(raw_model, max_ngram_length=1):
     """
 
-    categories = {}
+    categories: Dict[str, List[str]] = {}
 
     for portion in raw_model:
         text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
@@ -31,7 +36,7 @@ def compile(raw_model, max_ngram_length=1):
         except KeyError:
             categories[category] = text
 
-    categories_by_count = {}
+    categories_by_count: Dict[str, Dict[str, float]] = {}
 
     names = []
@@ -42,14 +47,10 @@ def compile(raw_model, max_ngram_length=1):
             categories_by_count[category] = {}
         for word in text:
             try:
-                categories_by_count[category][word] += 1 / len(
-                    categories[category]
-                )
+                categories_by_count[category][word] += 1 / len(categories[category])
             except KeyError:
-                categories_by_count[category][word] = 1 / len(
-                    categories[category]
-                )
+                categories_by_count[category][word] = 1 / len(categories[category])
-    word_weights = {}
+    word_weights: Dict[str, Dict[str, float]] = {}
     for category, words in categories_by_count.items():
         for word, value in words.items():
             try:
@@ -57,14 +58,13 @@
             except KeyError:
                 word_weights[word] = {category: value}
 
-    model = {}
+    model: MODEL = {}
     for word, weights in word_weights.items():
         total = sum(weights.values())
-        model[word] = []
+        new_weights: List[int] = []
         for category in names:
-            model[word].append(
-                round((weights.get(category, 0) / total) * 65535)
-            )
+            new_weights.append(round((weights.get(category, 0) / total) * 65535))
+        model[word] = new_weights
 
     model["__names__"] = names
     model["__ngrams__"] = max_ngram_length

View File: gptc/tokenizer.py

@@ -1,13 +1,37 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 
+from typing import List, Union
+
+try:
+    import emoji
+
+    has_emoji = True
+except ImportError:
+    has_emoji = False
 
-def tokenize(text, max_ngram_length=1):
+
+def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
     """Convert a string to a list of lemmas."""
+    converted_text: Union[str, List[str]] = text.lower()
+
+    if has_emoji:
+        parts = []
+        highest_end = 0
+        for emoji_part in emoji.emoji_list(text):
+            parts += list(text[highest_end : emoji_part["match_start"]])
+            parts.append(emoji_part["emoji"])
+            highest_end = emoji_part["match_end"]
+        parts += list(text[highest_end:])
+        converted_text = [part for part in parts if part]
+
     tokens = [""]
 
-    for char in text.lower():
+    for char in converted_text:
         if char.isalpha() or char == "'":
             tokens[-1] += char
+        elif has_emoji and emoji.is_emoji(char):
+            tokens.append(char)
+            tokens.append("")
         elif tokens[-1] != "":
             tokens.append("")
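With the `emoji` package present, `emoji.emoji_list` (which reports each match with its `match_start`/`match_end` offsets) splits the input so that multi-codepoint emojis survive as single elements, and the per-character loop then emits them as standalone tokens. Roughly, assuming the unchanged remainder of `tokenize` below this hunk strips the empty placeholder strings:

    from gptc.tokenizer import tokenize

    print(tokenize("pizza 🍕 is great"))
    # with the emoji extra:  ['pizza', '🍕', 'is', 'great']
    # without it:            ['pizza', 'is', 'great']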

View File: gptc/weighting.py

@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 
 import math
+from typing import Sequence, Union, Tuple, List
 
 
-def _mean(numbers):
+def _mean(numbers: Sequence[float]) -> float:
     """Calculate the mean of a group of numbers
 
     Parameters
@@ -19,7 +20,7 @@ def _mean(numbers):
     return sum(numbers) / len(numbers)
 
 
-def _standard_deviation(numbers):
+def _standard_deviation(numbers: Sequence[float]) -> float:
     """Calculate the standard deviation of a group of numbers
 
     Parameters
@@ -38,8 +39,8 @@ def _standard_deviation(numbers):
     return math.sqrt(_mean(squared_deviations))
 
 
-def weight(numbers):
+def weight(numbers: Sequence[float]) -> List[float]:
     standard_deviation = _standard_deviation(numbers)
     weight = standard_deviation * 2
     weighted_numbers = [i * weight for i in numbers]
-    return weight, weighted_numbers
+    return weighted_numbers
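`weight` scales every number in the group by twice the group's standard deviation, so a token whose per-category frequencies diverge sharply contributes more than one spread evenly across categories; the only behavioral change here is that the scalar weight is no longer returned alongside the list, matching the updated call site in `classifier.py`. Two worked values:

    from gptc.weighting import weight

    # Frequencies [1.0, 0.0]: standard deviation 0.5, so weight = 1.0.
    print(weight([1.0, 0.0]))  # [1.0, 0.0] -- a discriminating token is kept

    # Frequencies [0.5, 0.5]: standard deviation 0, so weight = 0.
    print(weight([0.5, 0.5]))  # [0.0, 0.0] -- an even token is zeroed out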