Compare commits
7 Commits
62c3c27ddd
...
e711767d24
Author | SHA1 | Date | |
---|---|---|---|
e711767d24 | |||
67ac3a4591 | |||
b36d8e6081 | |||
48639f5d8d | |||
a207e281e7 | |||
e272ab42d1 | |||
bd0028a108 |
12
README.md
12
README.md
|
@ -4,6 +4,12 @@ General-purpose text classifier in Python
|
||||||
|
|
||||||
GPTC provides both a CLI tool and a Python library.
|
GPTC provides both a CLI tool and a Python library.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
pip install gptc[emoji] # handles emojis! (see section "Emoji")
|
||||||
|
# Or, if you don't need emoji support,
|
||||||
|
pip install gptc # no dependencies!
|
||||||
|
|
||||||
## CLI Tool
|
## CLI Tool
|
||||||
|
|
||||||
### Classifying text
|
### Classifying text
|
||||||
|
@ -72,6 +78,12 @@ reduced to the one used when compiling the model.
|
||||||
Models compiled with older versions of GPTC which did not support ngrams are
|
Models compiled with older versions of GPTC which did not support ngrams are
|
||||||
handled the same way as models compiled with `max_ngram_length=1`.
|
handled the same way as models compiled with `max_ngram_length=1`.
|
||||||
|
|
||||||
|
## Emoji
|
||||||
|
|
||||||
|
If the [`emoji`](https://pypi.org/project/emoji/) package is installed, GPTC
|
||||||
|
will automatically handle emojis the same way as words. If it is not installed,
|
||||||
|
GPTC will still work but will ignore emojis.
|
||||||
|
|
||||||
## Model format
|
## Model format
|
||||||
|
|
||||||
This section explains the raw model format, which is how you should create and
|
This section explains the raw model format, which is how you should create and
|
||||||
|
|
|
@ -33,7 +33,9 @@ print(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
classifier = gptc.Classifier(gptc.compile(raw_model, max_ngram_length), max_ngram_length)
|
classifier = gptc.Classifier(
|
||||||
|
gptc.compile(raw_model, max_ngram_length), max_ngram_length
|
||||||
|
)
|
||||||
print(
|
print(
|
||||||
"Average classification time over",
|
"Average classification time over",
|
||||||
classify_iterations,
|
classify_iterations,
|
||||||
|
|
|
@ -2,6 +2,10 @@
|
||||||
|
|
||||||
"""General-Purpose Text Classifier"""
|
"""General-Purpose Text Classifier"""
|
||||||
|
|
||||||
from gptc.compiler import compile
|
from gptc.compiler import compile as compile
|
||||||
from gptc.classifier import Classifier
|
from gptc.classifier import Classifier as Classifier
|
||||||
from gptc.exceptions import *
|
from gptc.exceptions import (
|
||||||
|
GPTCError as GPTCError,
|
||||||
|
ModelError as ModelError,
|
||||||
|
UnsupportedModelError as UnsupportedModelError,
|
||||||
|
)
|
||||||
|
|
|
@ -6,7 +6,8 @@ import json
|
||||||
import sys
|
import sys
|
||||||
import gptc
|
import gptc
|
||||||
|
|
||||||
def main():
|
|
||||||
|
def main() -> None:
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="General Purpose Text Classifier", prog="gptc"
|
description="General Purpose Text Classifier", prog="gptc"
|
||||||
)
|
)
|
||||||
|
@ -14,11 +15,23 @@ def main():
|
||||||
|
|
||||||
compile_parser = subparsers.add_parser("compile", help="compile a raw model")
|
compile_parser = subparsers.add_parser("compile", help="compile a raw model")
|
||||||
compile_parser.add_argument("model", help="raw model to compile")
|
compile_parser.add_argument("model", help="raw model to compile")
|
||||||
compile_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
|
compile_parser.add_argument(
|
||||||
|
"--max-ngram-length",
|
||||||
|
"-n",
|
||||||
|
help="maximum ngram length",
|
||||||
|
type=int,
|
||||||
|
default=1,
|
||||||
|
)
|
||||||
|
|
||||||
classify_parser = subparsers.add_parser("classify", help="classify text")
|
classify_parser = subparsers.add_parser("classify", help="classify text")
|
||||||
classify_parser.add_argument("model", help="compiled model to use")
|
classify_parser.add_argument("model", help="compiled model to use")
|
||||||
classify_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
|
classify_parser.add_argument(
|
||||||
|
"--max-ngram-length",
|
||||||
|
"-n",
|
||||||
|
help="maximum ngram length",
|
||||||
|
type=int,
|
||||||
|
default=1,
|
||||||
|
)
|
||||||
group = classify_parser.add_mutually_exclusive_group()
|
group = classify_parser.add_mutually_exclusive_group()
|
||||||
group.add_argument(
|
group.add_argument(
|
||||||
"-j",
|
"-j",
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
import gptc.tokenizer, gptc.compiler, gptc.exceptions, gptc.weighting
|
import gptc.tokenizer, gptc.compiler, gptc.exceptions, gptc.weighting
|
||||||
import warnings
|
import warnings
|
||||||
|
from typing import Dict, Union, cast, List
|
||||||
|
|
||||||
|
|
||||||
class Classifier:
|
class Classifier:
|
||||||
|
@ -24,17 +25,14 @@ class Classifier:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, model, max_ngram_length=1):
|
def __init__(self, model: gptc.compiler.MODEL, max_ngram_length: int = 1):
|
||||||
if model.get("__version__", 0) != 3:
|
if model.get("__version__", 0) != 3:
|
||||||
raise gptc.exceptions.UnsupportedModelError(
|
raise gptc.exceptions.UnsupportedModelError(f"unsupported model version")
|
||||||
f"unsupported model version"
|
|
||||||
)
|
|
||||||
self.model = model
|
self.model = model
|
||||||
self.max_ngram_length = min(
|
model_ngrams = cast(int, model.get("__ngrams__", 1))
|
||||||
max_ngram_length, model.get("__ngrams__", 1)
|
self.max_ngram_length = min(max_ngram_length, model_ngrams)
|
||||||
)
|
|
||||||
|
|
||||||
def confidence(self, text):
|
def confidence(self, text: str) -> Dict[str, float]:
|
||||||
"""Classify text with confidence.
|
"""Classify text with confidence.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
@ -52,29 +50,28 @@ class Classifier:
|
||||||
|
|
||||||
model = self.model
|
model = self.model
|
||||||
|
|
||||||
text = gptc.tokenizer.tokenize(text, self.max_ngram_length)
|
tokens = gptc.tokenizer.tokenize(text, self.max_ngram_length)
|
||||||
probs = {}
|
numbered_probs: Dict[int, float] = {}
|
||||||
for word in text:
|
for word in tokens:
|
||||||
try:
|
try:
|
||||||
weight, weighted_numbers = gptc.weighting.weight(
|
weighted_numbers = gptc.weighting.weight(
|
||||||
[i / 65535 for i in model[word]]
|
[i / 65535 for i in cast(List[float], model[word])]
|
||||||
)
|
)
|
||||||
for category, value in enumerate(weighted_numbers):
|
for category, value in enumerate(weighted_numbers):
|
||||||
try:
|
try:
|
||||||
probs[category] += value
|
numbered_probs[category] += value
|
||||||
except KeyError:
|
except KeyError:
|
||||||
probs[category] = value
|
numbered_probs[category] = value
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
probs = {
|
total = sum(numbered_probs.values())
|
||||||
model["__names__"][category]: value
|
probs: Dict[str, float] = {
|
||||||
for category, value in probs.items()
|
cast(List[str], model["__names__"])[category]: value / total
|
||||||
|
for category, value in numbered_probs.items()
|
||||||
}
|
}
|
||||||
total = sum(probs.values())
|
|
||||||
probs = {category: value / total for category, value in probs.items()}
|
|
||||||
return probs
|
return probs
|
||||||
|
|
||||||
def classify(self, text):
|
def classify(self, text: str) -> Union[str, None]:
|
||||||
"""Classify text.
|
"""Classify text.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
@ -89,7 +86,7 @@ class Classifier:
|
||||||
category in the model were found.
|
category in the model were found.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
probs = self.confidence(text)
|
probs: Dict[str, float] = self.confidence(text)
|
||||||
try:
|
try:
|
||||||
return sorted(probs.items(), key=lambda x: x[1])[-1][0]
|
return sorted(probs.items(), key=lambda x: x[1])[-1][0]
|
||||||
except IndexError:
|
except IndexError:
|
||||||
|
|
|
@ -1,9 +1,14 @@
|
||||||
# SPDX-License-Identifier: LGPL-3.0-or-later
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
||||||
|
|
||||||
import gptc.tokenizer
|
import gptc.tokenizer
|
||||||
|
from typing import Iterable, Mapping, List, Dict, Union
|
||||||
|
|
||||||
|
WEIGHTS_T = List[int]
|
||||||
|
CONFIG_T = Union[List[str], int, str]
|
||||||
|
MODEL = Dict[str, Union[WEIGHTS_T, CONFIG_T]]
|
||||||
|
|
||||||
|
|
||||||
def compile(raw_model, max_ngram_length=1):
|
def compile(raw_model: Iterable[Mapping[str, str]], max_ngram_length: int = 1) -> MODEL:
|
||||||
"""Compile a raw model.
|
"""Compile a raw model.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
@ -21,7 +26,7 @@ def compile(raw_model, max_ngram_length=1):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
categories = {}
|
categories: Dict[str, List[str]] = {}
|
||||||
|
|
||||||
for portion in raw_model:
|
for portion in raw_model:
|
||||||
text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
|
text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
|
||||||
|
@ -31,7 +36,7 @@ def compile(raw_model, max_ngram_length=1):
|
||||||
except KeyError:
|
except KeyError:
|
||||||
categories[category] = text
|
categories[category] = text
|
||||||
|
|
||||||
categories_by_count = {}
|
categories_by_count: Dict[str, Dict[str, float]] = {}
|
||||||
|
|
||||||
names = []
|
names = []
|
||||||
|
|
||||||
|
@ -42,14 +47,10 @@ def compile(raw_model, max_ngram_length=1):
|
||||||
categories_by_count[category] = {}
|
categories_by_count[category] = {}
|
||||||
for word in text:
|
for word in text:
|
||||||
try:
|
try:
|
||||||
categories_by_count[category][word] += 1 / len(
|
categories_by_count[category][word] += 1 / len(categories[category])
|
||||||
categories[category]
|
|
||||||
)
|
|
||||||
except KeyError:
|
except KeyError:
|
||||||
categories_by_count[category][word] = 1 / len(
|
categories_by_count[category][word] = 1 / len(categories[category])
|
||||||
categories[category]
|
word_weights: Dict[str, Dict[str, float]] = {}
|
||||||
)
|
|
||||||
word_weights = {}
|
|
||||||
for category, words in categories_by_count.items():
|
for category, words in categories_by_count.items():
|
||||||
for word, value in words.items():
|
for word, value in words.items():
|
||||||
try:
|
try:
|
||||||
|
@ -57,14 +58,13 @@ def compile(raw_model, max_ngram_length=1):
|
||||||
except KeyError:
|
except KeyError:
|
||||||
word_weights[word] = {category: value}
|
word_weights[word] = {category: value}
|
||||||
|
|
||||||
model = {}
|
model: MODEL = {}
|
||||||
for word, weights in word_weights.items():
|
for word, weights in word_weights.items():
|
||||||
total = sum(weights.values())
|
total = sum(weights.values())
|
||||||
model[word] = []
|
new_weights: List[int] = []
|
||||||
for category in names:
|
for category in names:
|
||||||
model[word].append(
|
new_weights.append(round((weights.get(category, 0) / total) * 65535))
|
||||||
round((weights.get(category, 0) / total) * 65535)
|
model[word] = new_weights
|
||||||
)
|
|
||||||
|
|
||||||
model["__names__"] = names
|
model["__names__"] = names
|
||||||
model["__ngrams__"] = max_ngram_length
|
model["__ngrams__"] = max_ngram_length
|
||||||
|
|
|
@ -1,13 +1,37 @@
|
||||||
# SPDX-License-Identifier: LGPL-3.0-or-later
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
||||||
|
|
||||||
|
from typing import List, Union
|
||||||
|
|
||||||
def tokenize(text, max_ngram_length=1):
|
try:
|
||||||
|
import emoji
|
||||||
|
|
||||||
|
has_emoji = True
|
||||||
|
except ImportError:
|
||||||
|
has_emoji = False
|
||||||
|
|
||||||
|
|
||||||
|
def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
|
||||||
"""Convert a string to a list of lemmas."""
|
"""Convert a string to a list of lemmas."""
|
||||||
|
converted_text: Union[str, List[str]] = text.lower()
|
||||||
|
|
||||||
|
if has_emoji:
|
||||||
|
parts = []
|
||||||
|
highest_end = 0
|
||||||
|
for emoji_part in emoji.emoji_list(text):
|
||||||
|
parts += list(text[highest_end : emoji_part["match_start"]])
|
||||||
|
parts.append(emoji_part["emoji"])
|
||||||
|
highest_end = emoji_part["match_end"]
|
||||||
|
parts += list(text[highest_end:])
|
||||||
|
converted_text = [part for part in parts if part]
|
||||||
|
|
||||||
tokens = [""]
|
tokens = [""]
|
||||||
|
|
||||||
for char in text.lower():
|
for char in converted_text:
|
||||||
if char.isalpha() or char == "'":
|
if char.isalpha() or char == "'":
|
||||||
tokens[-1] += char
|
tokens[-1] += char
|
||||||
|
elif has_emoji and emoji.is_emoji(char):
|
||||||
|
tokens.append(char)
|
||||||
|
tokens.append("")
|
||||||
elif tokens[-1] != "":
|
elif tokens[-1] != "":
|
||||||
tokens.append("")
|
tokens.append("")
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
# SPDX-License-Identifier: LGPL-3.0-or-later
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
from typing import Sequence, Union, Tuple, List
|
||||||
|
|
||||||
|
|
||||||
def _mean(numbers):
|
def _mean(numbers: Sequence[float]) -> float:
|
||||||
"""Calculate the mean of a group of numbers
|
"""Calculate the mean of a group of numbers
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
@ -19,7 +20,7 @@ def _mean(numbers):
|
||||||
return sum(numbers) / len(numbers)
|
return sum(numbers) / len(numbers)
|
||||||
|
|
||||||
|
|
||||||
def _standard_deviation(numbers):
|
def _standard_deviation(numbers: Sequence[float]) -> float:
|
||||||
"""Calculate the standard deviation of a group of numbers
|
"""Calculate the standard deviation of a group of numbers
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
@ -38,8 +39,8 @@ def _standard_deviation(numbers):
|
||||||
return math.sqrt(_mean(squared_deviations))
|
return math.sqrt(_mean(squared_deviations))
|
||||||
|
|
||||||
|
|
||||||
def weight(numbers):
|
def weight(numbers: Sequence[float]) -> List[float]:
|
||||||
standard_deviation = _standard_deviation(numbers)
|
standard_deviation = _standard_deviation(numbers)
|
||||||
weight = standard_deviation * 2
|
weight = standard_deviation * 2
|
||||||
weighted_numbers = [i * weight for i in numbers]
|
weighted_numbers = [i * weight for i in numbers]
|
||||||
return weight, weighted_numbers
|
return weighted_numbers
|
||||||
|
|
Loading…
Reference in New Issue
Block a user