Add Classifier.confidence()

This commit is contained in:
Samuel Sloniker 2021-11-03 06:38:22 -07:00
parent ac8ef8db89
commit be543134bc
3 changed files with 55 additions and 20 deletions

View File

@ -9,8 +9,15 @@ GPTC provides both a CLI tool and a Python library.
`python -m gptc <modelfile>` `python -m gptc <modelfile>`
This will prompt for a string and classify it, outputting the category on This will prompt for a string and classify it, outputting the category on
stdout (or "None" if it cannot determine stdout (or "None" if it cannot determine anything).
anything).
Alternatively, if you need confidence data, use:
`python -m gptc -j <modelfile>`
This will print (in JSON) a dict of the format `{category: probability,
category: probability, ...}` to stdout.
### Compiling models ### Compiling models
gptc <raw model file> -c|--compile <compiled model file> gptc <raw model file> -c|--compile <compiled model file>
@ -19,9 +26,11 @@ anything).
### `gptc.Classifier(model)` ### `gptc.Classifier(model)`
Create a `Classifier` object using the given *compiled* model (as a dict, not Create a `Classifier` object using the given *compiled* model (as a dict, not
JSON). JSON).
#### `Classifier.confidence(text)`
Classify `text`. Returns a dict of the format `{category: probability,
category: probability, ...}`, or `{}` if no words matching any category in the
model were found.
#### `Classifier.classify(text)` #### `Classifier.classify(text)`
Classify `text` with GPTC using the model used to instantiate the Classify `text`. Returns the category into which the text is placed (as a
`Classifier`. Returns the category into which the text is placed (as a
string), or `None` when it cannot classify the text. string), or `None` when it cannot classify the text.
## `gptc.compile(raw_model)` ## `gptc.compile(raw_model)`
Compile a raw model (as a list, not JSON) and return the compiled model (as a Compile a raw model (as a list, not JSON) and return the compiled model (as a

View File

@ -7,6 +7,7 @@ import gptc
parser = argparse.ArgumentParser(description="General Purpose Text Classifier") parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
parser.add_argument('model', help='model to use') parser.add_argument('model', help='model to use')
parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile') parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
parser.add_argument('-j', '--confidence', help='output confidence dict in json', action='store_true')
args = parser.parse_args() args = parser.parse_args()
with open(args.model, 'r') as f: with open(args.model, 'r') as f:
@ -20,4 +21,7 @@ else:
text = input('Text to analyse: ') text = input('Text to analyse: ')
else: else:
text = sys.stdin.read() text = sys.stdin.read()
if args.confidence:
print(json.dumps(classifier.confidence(text)))
else:
print(classifier.classify(text)) print(classifier.classify(text))

View File

@ -34,6 +34,40 @@ class Classifier:
warnings.warn("model needed to be recompiled on-the-fly; please re-compile it and use the new compiled model in the future") warnings.warn("model needed to be recompiled on-the-fly; please re-compile it and use the new compiled model in the future")
self.model = gptc.compiler.compile(raw_model) self.model = gptc.compiler.compile(raw_model)
def confidence(self, text):
    """Classify text with confidence.

    Each token of the text that appears in the model contributes its
    per-category weights, scaled so that every matching token adds the
    same total amount. The accumulated scores are then normalized to sum
    to 1.

    Parameters
    ----------
    text : str
        The text to classify

    Returns
    -------
    dict
        {category:probability, category:probability...} or {} if no words
        matching any categories in the model were found

    """
    model = self.model
    scores = {}
    for word in gptc.tokenizer.tokenize(text):
        # Single lookup; tokens absent from the model contribute nothing.
        weights = model.get(word)
        if not weights:
            continue
        word_total = sum(weights.values())
        if not word_total:
            # A token whose weights sum to zero carries no information and
            # would otherwise raise ZeroDivisionError below.
            continue
        for category, value in weights.items():
            scores[category] = scores.get(category, 0) + value / word_total

    total = sum(scores.values())
    if not total:
        # Nothing matched (or the scores cancel out): no guess can be made.
        return {}
    return {category: score / total for category, score in scores.items()}
def classify(self, text): def classify(self, text):
"""Classify text. """Classify text.
@ -45,23 +79,11 @@ class Classifier:
Returns Returns
------- -------
str or None str or None
The most likely category, or None if no guess was made. The most likely category, or None if no words matching any
category in the model were found.
""" """
probs = self.confidence(text)
model = self.model
text = gptc.tokenizer.tokenize(text)
probs = {}
for word in text:
try:
for category, value in model[word].items():
try:
probs[category] += value
except KeyError:
probs[category] = value
except KeyError:
pass
try: try:
return sorted(probs.items(), key=lambda x: x[1])[-1][0] return sorted(probs.items(), key=lambda x: x[1])[-1][0]
except IndexError: except IndexError: