Merge pull request #3 from kj7rrv/algo2

Add Classifier.confidence()
This commit is contained in:
Samuel Sloniker 2021-11-05 18:39:59 -07:00 committed by GitHub
commit b3d975f3d1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 55 additions and 20 deletions

View File

@ -9,8 +9,15 @@ GPTC provides both a CLI tool and a Python library.
`python -m gptc <modelfile>`
This will prompt for a string and classify it, outputting the category on
stdout (or "None" if it cannot determine
anything).
stdout (or "None" if it cannot determine anything).
Alternatively, if you need confidence data, use:
`python -m gptc -j <modelfile>`
This will print (in JSON) a dict of the format `{category: probability,
category: probability, ...}` to stdout.
### Compiling models
gptc <raw model file> -c|--compile <compiled model file>
@ -19,9 +26,11 @@ anything).
### `gptc.Classifier(model)`
Create a `Classifier` object using the given *compiled* model (as a dict, not
JSON).
#### `Classifier.confidence(text)`
Classify `text`. Returns a dict of the format `{category: probability,
category: probability, ...}`, or an empty dict if no words in `text` match any category in the model.
#### `Classifier.classify(text)`
Classify `text` with GPTC using the model used to instantiate the
`Classifier`. Returns the category into which the text is placed (as a
Classify `text`. Returns the category into which the text is placed (as a
string), or `None` when it cannot classify the text.
## `gptc.compile(raw_model)`
Compile a raw model (as a list, not JSON) and return the compiled model (as a

View File

@ -7,6 +7,7 @@ import gptc
# Command-line interface: one positional model path plus two optional modes.
parser = argparse.ArgumentParser(
    description="General Purpose Text Classifier",
)
parser.add_argument("model", help="model to use")
# -c/--compile takes the path the compiled model is written to.
parser.add_argument(
    "-c",
    "--compile",
    metavar="outfile",
    help="compile raw model model to outfile",
)
# -j/--confidence is a flag: print the per-category confidence dict as JSON.
parser.add_argument(
    "-j",
    "--confidence",
    action="store_true",
    help="output confidence dict in json",
)
args = parser.parse_args()
with open(args.model, 'r') as f:
@ -20,4 +21,7 @@ else:
text = input('Text to analyse: ')
else:
text = sys.stdin.read()
if args.confidence:
print(json.dumps(classifier.confidence(text)))
else:
print(classifier.classify(text))

View File

@ -34,6 +34,40 @@ class Classifier:
warnings.warn("model needed to be recompiled on-the-fly; please re-compile it and use the new compiled model in the future")
self.model = gptc.compiler.compile(raw_model)
def confidence(self, text):
    """Classify text with confidence.

    Parameters
    ----------
    text : str
        The text to classify

    Returns
    -------
    dict
        {category: probability, category: probability, ...}, normalized
        so that the values sum to 1, or {} if no words matching any
        categories in the model were found

    """
    model = self.model
    scores = {}
    for word in gptc.tokenizer.tokenize(text):
        # Keep the try body to the single lookup so that an unrelated
        # KeyError raised further down is never silently swallowed.
        try:
            weights = model[word]
        except KeyError:
            # Word is unknown to the model; it contributes nothing.
            continue

        word_total = sum(weights.values())
        if not word_total:
            # An entry whose weights sum to zero carries no information;
            # skipping it also avoids a ZeroDivisionError below.
            continue

        # Each known word contributes one unit of weight, split across
        # categories in proportion to that word's per-category values.
        for category, value in weights.items():
            scores[category] = scores.get(category, 0) + value / word_total

    # Normalize the accumulated scores. When no word matched, `scores` is
    # empty, the comprehension performs no division, and {} is returned.
    total = sum(scores.values())
    return {category: value / total for category, value in scores.items()}
def classify(self, text):
"""Classify text.
@ -45,23 +79,11 @@ class Classifier:
Returns
-------
str or None
The most likely category, or None if no guess was made.
The most likely category, or None if no words matching any
category in the model were found.
"""
model = self.model
text = gptc.tokenizer.tokenize(text)
probs = {}
for word in text:
try:
for category, value in model[word].items():
try:
probs[category] += value
except KeyError:
probs[category] = value
except KeyError:
pass
probs = self.confidence(text)
try:
return sorted(probs.items(), key=lambda x: x[1])[-1][0]
except IndexError: