parent
75fdb5ba3c
commit
56550ca457
|
@ -112,13 +112,6 @@ GPTC.
|
|||
|
||||
See `models/unpacked/` for an example of the format.
|
||||
|
||||
### `gptc.Classifier(model, max_ngram_length=1)`
|
||||
|
||||
`Classifier` objects are deprecated starting with GPTC 3.1.0, and will be
|
||||
removed in 4.0.0. See [the README from
|
||||
3.0.1](https://git.kj7rrv.com/kj7rrv/gptc/src/tag/v3.0.1/README.md) if you need
|
||||
documentation.
|
||||
|
||||
## Ngrams
|
||||
|
||||
GPTC optionally supports using ngrams to improve classification accuracy. They
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
"""General-Purpose Text Classifier"""
|
||||
|
||||
from gptc.compiler import compile as compile
|
||||
from gptc.classifier import Classifier as Classifier
|
||||
from gptc.pack import pack as pack
|
||||
from gptc.model import Model as Model, deserialize as deserialize
|
||||
from gptc.tokenizer import normalize as normalize
|
||||
|
|
|
@ -44,19 +44,6 @@ def main() -> None:
|
|||
type=int,
|
||||
default=1,
|
||||
)
|
||||
group = classify_parser.add_mutually_exclusive_group()
|
||||
group.add_argument(
|
||||
"-j",
|
||||
"--json",
|
||||
help="output confidence dict as JSON (default)",
|
||||
action="store_true",
|
||||
)
|
||||
group.add_argument(
|
||||
"-c",
|
||||
"--category",
|
||||
help="output most likely category or `None`",
|
||||
action="store_true",
|
||||
)
|
||||
|
||||
check_parser = subparsers.add_parser(
|
||||
"check", help="check one word or ngram in model"
|
||||
|
@ -88,12 +75,7 @@ def main() -> None:
|
|||
else:
|
||||
text = sys.stdin.read()
|
||||
|
||||
if args.category:
|
||||
classifier = gptc.Classifier(model, args.max_ngram_length)
|
||||
print(classifier.classify(text))
|
||||
else:
|
||||
probabilities = model.confidence(text, args.max_ngram_length)
|
||||
print(json.dumps(probabilities))
|
||||
print(json.dumps(model.confidence(text, args.max_ngram_length)))
|
||||
elif args.subparser_name == "check":
|
||||
with open(args.model, "rb") as f:
|
||||
model = gptc.deserialize(f)
|
||||
|
|
|
@ -1,68 +0,0 @@
|
|||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
import gptc.model
|
||||
from typing import Dict, Union
|
||||
|
||||
|
||||
class Classifier:
    """A text classifier bound to one model and one ngram length.

    This is a thin convenience wrapper: it stores a model together with the
    ngram length to use and forwards the actual work to the model's
    ``confidence`` method.

    Parameters
    ----------
    model : gptc.model.Model
        A compiled GPTC model.

    max_ngram_length : int
        The maximum ngram length to use when tokenizing input. If this is
        greater than the value used when the model was compiled, it will be
        silently lowered to that value.

    Attributes
    ----------
    model : gptc.model.Model
        The model used.

    max_ngram_length : int
        The effective maximum ngram length: the smaller of the requested
        value and ``model.max_ngram_length``.

    """

    def __init__(self, model: "gptc.model.Model", max_ngram_length: int = 1):
        self.model = model
        # Clamp the request to the model's own limit instead of raising.
        self.max_ngram_length = min(max_ngram_length, model.max_ngram_length)

    def confidence(self, text: str) -> Dict[str, float]:
        """Classify text with confidence.

        Parameters
        ----------
        text : str
            The text to classify

        Returns
        -------
        dict
            {category:probability, category:probability...} or {} if no words
            matching any categories in the model were found

        """
        return self.model.confidence(text, self.max_ngram_length)

    def classify(self, text: str) -> Union[str, None]:
        """Classify text.

        Parameters
        ----------
        text : str
            The text to classify

        Returns
        -------
        str or None
            The most likely category, or None if no words matching any
            category in the model were found.

        """
        probs: Dict[str, float] = self.confidence(text)

        # Single pass instead of sorting everything just to take the last
        # element. ``>=`` lets a later category replace an earlier one with
        # the same probability, so the last of several equal maxima wins.
        best = None
        for item in probs.items():
            if best is None or item[1] >= best[1]:
                best = item

        # ``best`` is still None only when ``probs`` was empty.
        return None if best is None else best[0]
|
Loading…
Reference in New Issue
Block a user