From 56550ca4575424ffba82e9800ae28cc49b6baec9 Mon Sep 17 00:00:00 2001 From: Samuel Sloniker Date: Sun, 16 Apr 2023 14:27:07 -0700 Subject: [PATCH] Remove Classifier objects Closes #16 --- README.md | 7 ----- gptc/__init__.py | 1 - gptc/__main__.py | 20 +------------- gptc/classifier.py | 68 ---------------------------------------------- 4 files changed, 1 insertion(+), 95 deletions(-) delete mode 100755 gptc/classifier.py diff --git a/README.md b/README.md index 33f2531..6eace01 100644 --- a/README.md +++ b/README.md @@ -112,13 +112,6 @@ GPTC. See `models/unpacked/` for an example of the format. -### `gptc.Classifier(model, max_ngram_length=1)` - -`Classifier` objects are deprecated starting with GPTC 3.1.0, and will be -removed in 4.0.0. See [the README from -3.0.2](https://git.kj7rrv.com/kj7rrv/gptc/src/tag/v3.0.1/README.md) if you need -documentation. - ## Ngrams GPTC optionally supports using ngrams to improve classification accuracy. They diff --git a/gptc/__init__.py b/gptc/__init__.py index fd24dce..05f5dc5 100644 --- a/gptc/__init__.py +++ b/gptc/__init__.py @@ -3,7 +3,6 @@ """General-Purpose Text Classifier""" from gptc.compiler import compile as compile -from gptc.classifier import Classifier as Classifier from gptc.pack import pack as pack from gptc.model import Model as Model, deserialize as deserialize from gptc.tokenizer import normalize as normalize diff --git a/gptc/__main__.py b/gptc/__main__.py index 9586299..9c87536 100644 --- a/gptc/__main__.py +++ b/gptc/__main__.py @@ -44,19 +44,6 @@ def main() -> None: type=int, default=1, ) - group = classify_parser.add_mutually_exclusive_group() - group.add_argument( - "-j", - "--json", - help="output confidence dict as JSON (default)", - action="store_true", - ) - group.add_argument( - "-c", - "--category", - help="output most likely category or `None`", - action="store_true", - ) check_parser = subparsers.add_parser( "check", help="check one word or ngram in model" @@ -88,12 +75,7 @@ def main() -> None: else: text = sys.stdin.read() - if args.category: - classifier = gptc.Classifier(model, args.max_ngram_length) - print(classifier.classify(text)) - else: - probabilities = model.confidence(text, args.max_ngram_length) - print(json.dumps(probabilities)) + print(json.dumps(model.confidence(text, args.max_ngram_length))) elif args.subparser_name == "check": with open(args.model, "rb") as f: model = gptc.deserialize(f) diff --git a/gptc/classifier.py b/gptc/classifier.py deleted file mode 100755 index f2e2bf4..0000000 --- a/gptc/classifier.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later - -import gptc.model -from typing import Dict, Union - - -class Classifier: - """A text classifier. - - Parameters - ---------- - model : dict - A compiled GPTC model. - - max_ngram_length : int - The maximum ngram length to use when tokenizing input. If this is - greater than the value used when the model was compiled, it will be - silently lowered to that value. - - Attributes - ---------- - model : dict - The model used. - - """ - - def __init__(self, model: gptc.model.Model, max_ngram_length: int = 1): - self.model = model - model_ngrams = model.max_ngram_length - self.max_ngram_length = min(max_ngram_length, model_ngrams) - - def confidence(self, text: str) -> Dict[str, float]: - """Classify text with confidence. - - Parameters - ---------- - text : str - The text to classify - - Returns - ------- - dict - {category:probability, category:probability...} or {} if no words - matching any categories in the model were found - - """ - return self.model.confidence(text, self.max_ngram_length) - - def classify(self, text: str) -> Union[str, None]: - """Classify text. - - Parameters - ---------- - text : str - The text to classify - - Returns - ------- - str or None - The most likely category, or None if no words matching any - category in the model were found. - - """ - probs: Dict[str, float] = self.confidence(text) - try: - return sorted(probs.items(), key=lambda x: x[1])[-1][0] - except IndexError: - return None