Add Classifier.confidence()

2021-11-03 06:38:22 -07:00 · 2021-11-03 06:38:22 -07:00 · be543134bc
commit be543134bc
parent ac8ef8db89
3 changed files with 55 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -9,8 +9,15 @@ GPTC provides both a CLI tool and a Python library.
    `python -m gptc <modelfile>`
 This will prompt for a string and classify it, outputting the category on
-stdout (or "None" if it cannot determine
+stdout (or "None" if it cannot determine anything).
-anything).
+
 Alternatively, if you need confidence data, use:
    `python -m gptc -j <modelfile>`
 This will print (in JSON) a dict of the format `{category: probability,
 category:probability, ...}` to stdout.
 ### Compiling models
    gptc <raw model file> -c|--compile <compiled model file>
@ -19,9 +26,11 @@ anything).
 ### `gptc.Classifier(model)`
 Create a `Classifier` object using the given *compiled* model (as a dict, not
 JSON).
 #### `Classifier.confidence(text)`
 Classify `text`. Returns a dict of the format `{category: probability,
 category:probability, ...}`
 #### `Classifier.classify(text)`
-Classify `text` with GPTC using the model used to instantiate the
+Classify `text`. Returns the category into which the text is placed (as a
-`Classifier`. Returns the category into which the text is placed (as a
 string), or `None` when it cannot classify the text.
 ## `gptc.compile(raw_model)`
 Compile a raw model (as a list, not JSON) and return the compiled model (as a
--- a/gptc/main.py
+++ b/gptc/main.py
@ -7,6 +7,7 @@ import gptc
 # Command-line interface: one positional model path plus two optional modes.
 parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
 parser.add_argument('model', help='model to use')
 # -c treats `model` as a raw model and names the file to write the compiled
 # result to.
 parser.add_argument('-c', '--compile', help='compile raw model to outfile', metavar='outfile')
 # -j switches the output from a bare category to a JSON confidence dict.
 parser.add_argument('-j', '--confidence', help='output confidence dict in json', action='store_true')
 args = parser.parse_args()
 with open(args.model, 'r') as f:
@ -20,4 +21,7 @@ else:
        text = input('Text to analyse: ')
    else:
        text = sys.stdin.read()
    if args.confidence:
        print(json.dumps(classifier.confidence(text)))
    else:
        print(classifier.classify(text))
--- a/gptc/classifier.py
+++ b/gptc/classifier.py
@ -34,6 +34,40 @@ class Classifier:
            warnings.warn("model needed to be recompiled on-the-fly; please re-compile it and use the new compiled model in the future")
            self.model = gptc.compiler.compile(raw_model)
    def confidence(self, text):
        """Classify text with confidence.
        Each token of the text that appears in the model contributes its
        per-category weights, scaled so that every token's contribution sums
        to 1. The accumulated scores are then scaled so that the returned
        values sum to 1 as well.
        Parameters
        ----------
        text : str
            The text to classify
        Returns
        -------
        dict
            {category:probability, category:probability...} or {} if no words
            matching any categories in the model were found
        """
        model = self.model
        probs = {}
        for word in gptc.tokenizer.tokenize(text):
            weights = model.get(word)
            if not weights:
                # The word is not in the model (or has no categories).
                continue
            word_total = sum(weights.values())
            if not word_total:
                # All-zero weights carry no information; skipping the word
                # also avoids a ZeroDivisionError below.
                continue
            for category, value in weights.items():
                probs[category] = probs.get(category, 0) + value / word_total
        total = sum(probs.values())
        if not total:
            # Nothing matched, so there is nothing to normalise.
            return {}
        return {category: value / total for category, value in probs.items()}
    def classify(self, text):
        """Classify text.
@ -45,23 +79,11 @@ class Classifier:
        Returns
        -------
        str or None
-            The most likely category, or None if no guess was made.
+            The most likely category, or None if no words matching any
            category in the model were found.
        """
-
+        probs = self.confidence(text)
        model = self.model
        text = gptc.tokenizer.tokenize(text)
        probs = {}
        for word in text:
            try:
                for category, value in model[word].items():
                    try:
                        probs[category] += value
                    except KeyError:
                        probs[category] = value
            except KeyError:
                pass
        try:
            return sorted(probs.items(), key=lambda x: x[1])[-1][0]
        except IndexError: