Add Classifier.confidence()
This commit is contained in:
parent
ac8ef8db89
commit
be543134bc
17
README.md
17
README.md
|
@ -9,8 +9,15 @@ GPTC provides both a CLI tool and a Python library.
|
||||||
`python -m gptc <modelfile>`
|
`python -m gptc <modelfile>`
|
||||||
|
|
||||||
This will prompt for a string and classify it, outputting the category on
|
This will prompt for a string and classify it, outputting the category on
|
||||||
stdout (or "None" if it cannot determine
|
stdout (or "None" if it cannot determine anything).
|
||||||
anything).
|
|
||||||
|
Alternatively, if you need confidence data, use:
|
||||||
|
|
||||||
|
`python -m gptc -j <modelfile>`
|
||||||
|
|
||||||
|
This will print (in JSON) a dict of the format `{category: probability,
|
||||||
|
category: probability, ...}` to stdout.
|
||||||
|
|
||||||
### Compiling models
|
### Compiling models
|
||||||
|
|
||||||
gptc <raw model file> -c|--compile <compiled model file>
|
gptc <raw model file> -c|--compile <compiled model file>
|
||||||
|
@ -19,9 +26,11 @@ anything).
|
||||||
### `gptc.Classifier(model)`
|
### `gptc.Classifier(model)`
|
||||||
Create a `Classifier` object using the given *compiled* model (as a dict, not
|
Create a `Classifier` object using the given *compiled* model (as a dict, not
|
||||||
JSON).
|
JSON).
|
||||||
|
#### `Classifier.confidence(text)`
|
||||||
|
Classify `text`. Returns a dict of the format `{category: probability,
|
||||||
|
category: probability, ...}`.
|
||||||
#### `Classifier.classify(text)`
|
#### `Classifier.classify(text)`
|
||||||
Classify `text` with GPTC using the model used to instantiate the
|
Classify `text`. Returns the category into which the text is placed (as a
|
||||||
`Classifier`. Returns the category into which the text is placed (as a
|
|
||||||
string), or `None` when it cannot classify the text.
|
string), or `None` when it cannot classify the text.
|
||||||
### `gptc.compile(raw_model)`
|
### `gptc.compile(raw_model)`
|
||||||
Compile a raw model (as a list, not JSON) and return the compiled model (as a
|
Compile a raw model (as a list, not JSON) and return the compiled model (as a
|
||||||
|
|
|
@ -7,6 +7,7 @@ import gptc
|
||||||
parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
|
parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
|
||||||
parser.add_argument('model', help='model to use')
|
parser.add_argument('model', help='model to use')
|
||||||
parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
|
parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
|
||||||
|
parser.add_argument('-j', '--confidence', help='output confidence dict in json', action='store_true')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
with open(args.model, 'r') as f:
|
with open(args.model, 'r') as f:
|
||||||
|
@ -20,4 +21,7 @@ else:
|
||||||
text = input('Text to analyse: ')
|
text = input('Text to analyse: ')
|
||||||
else:
|
else:
|
||||||
text = sys.stdin.read()
|
text = sys.stdin.read()
|
||||||
|
if args.confidence:
|
||||||
|
print(json.dumps(classifier.confidence(text)))
|
||||||
|
else:
|
||||||
print(classifier.classify(text))
|
print(classifier.classify(text))
|
||||||
|
|
|
@ -34,6 +34,40 @@ class Classifier:
|
||||||
warnings.warn("model needed to be recompiled on-the-fly; please re-compile it and use the new compiled model in the future")
|
warnings.warn("model needed to be recompiled on-the-fly; please re-compile it and use the new compiled model in the future")
|
||||||
self.model = gptc.compiler.compile(raw_model)
|
self.model = gptc.compiler.compile(raw_model)
|
||||||
|
|
||||||
|
def confidence(self, text):
    """Score each category the model associates with the given text.

    Every token of the text that is present in the model contributes its
    per-category weights, scaled so that one token's contributions sum to
    one. The accumulated scores are then divided by their overall sum.

    Parameters
    ----------
    text : str
        The text to classify

    Returns
    -------
    dict
        {category: probability, category: probability, ...} or {} if no
        words matching any categories in the model were found

    """

    weights_by_word = self.model
    scores = {}

    for token in gptc.tokenizer.tokenize(text):
        try:
            weights = weights_by_word[token]
        except KeyError:
            # Tokens absent from the model contribute nothing.
            continue

        # Scale this token's weights so they sum to one before accumulating.
        token_total = sum(weights.values())
        for category, weight in weights.items():
            share = weight / token_total
            if category in scores:
                scores[category] += share
            else:
                scores[category] = share

    # With no matching tokens `scores` is empty, so nothing is divided and
    # the result is simply {}.
    grand_total = sum(scores.values())
    return {
        category: score / grand_total
        for category, score in scores.items()
    }
|
||||||
|
|
||||||
def classify(self, text):
|
def classify(self, text):
|
||||||
"""Classify text.
|
"""Classify text.
|
||||||
|
|
||||||
|
@ -45,23 +79,11 @@ class Classifier:
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
str or None
|
str or None
|
||||||
The most likely category, or None if no guess was made.
|
The most likely category, or None if no words matching any
|
||||||
|
category in the model were found.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
probs = self.confidence(text)
|
||||||
model = self.model
|
|
||||||
|
|
||||||
text = gptc.tokenizer.tokenize(text)
|
|
||||||
probs = {}
|
|
||||||
for word in text:
|
|
||||||
try:
|
|
||||||
for category, value in model[word].items():
|
|
||||||
try:
|
|
||||||
probs[category] += value
|
|
||||||
except KeyError:
|
|
||||||
probs[category] = value
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
try:
|
try:
|
||||||
return sorted(probs.items(), key=lambda x: x[1])[-1][0]
|
return sorted(probs.items(), key=lambda x: x[1])[-1][0]
|
||||||
except IndexError:
|
except IndexError:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user