Merge pull request #3 from kj7rrv/algo2

Add Classifier.confidence()
2021-11-05 18:39:59 -07:00 · 2021-11-05 18:39:59 -07:00 · b3d975f3d1
commit b3d975f3d1
parent ac8ef8db89 be543134bc
3 changed files with 55 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -9,8 +9,15 @@ GPTC provides both a CLI tool and a Python library.
    `python -m gptc <modelfile>`

 This will prompt for a string and classify it, outputting the category on
-stdout (or "None" if it cannot determine
-anything).
+stdout (or "None" if it cannot determine anything).
+
+Alternatively, if you need confidence data, use:
+
+    `python -m gptc -j <modelfile>`
+
+This will print (in JSON) a dict of the format `{category: probability,
+category: probability, ...}` to stdout.
+
 ### Compiling models

    gptc <raw model file> -c|--compile <compiled model file>
@ -19,9 +26,11 @@ anything).
 ### `gptc.Classifier(model)`
 Create a `Classifier` object using the given *compiled* model (as a dict, not
 JSON).
+#### `Classifier.confidence(text)`
+Classify `text`. Returns a dict of the format `{category: probability,
+category: probability, ...}`, or `{}` if no words in `text` match the model.
 #### `Classifier.classify(text)`
-Classify `text` with GPTC using the model used to instantiate the
-`Classifier`. Returns the category into which the text is placed (as a
+Classify `text`. Returns the category into which the text is placed (as a
 string), or `None` when it cannot classify the text.
 ## `gptc.compile(raw_model)`
 Compile a raw model (as a list, not JSON) and return the compiled model (as a
--- a/gptc/main.py
+++ b/gptc/main.py
@ -7,6 +7,7 @@ import gptc
 parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
 parser.add_argument('model', help='model to use')
 parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
+parser.add_argument('-j', '--confidence', help='output confidence dict in json', action='store_true')
 args = parser.parse_args()

 with open(args.model, 'r') as f:
@ -20,4 +21,7 @@ else:
        text = input('Text to analyse: ')
    else:
        text = sys.stdin.read()
+    if args.confidence:
+        print(json.dumps(classifier.confidence(text)))
+    else:
        print(classifier.classify(text))
--- a/gptc/classifier.py
+++ b/gptc/classifier.py
@ -34,6 +34,40 @@ class Classifier:
            warnings.warn("model needed to be recompiled on-the-fly; please re-compile it and use the new compiled model in the future")
            self.model = gptc.compiler.compile(raw_model)

+    def confidence(self, text):
+        """Classify text with confidence.
+
+        Parameters
+        ----------
+        text : str
+            The text to classify
+
+        Returns
+        -------
+        dict
+            {category:probability, category:probability...} or {} if no words
+            matching any categories in the model were found
+
+        """
+
+        model = self.model
+
+        text = gptc.tokenizer.tokenize(text)
+        probs = {}
+        for word in text:
+            try:
+                total = sum(model[word].values())
+                for category, value in model[word].items():
+                    try:
+                        probs[category] += value / total
+                    except KeyError:
+                        probs[category] = value / total
+            except KeyError:
+                pass
+        total = sum(probs.values())
+        probs = {category: value/total for category, value in probs.items()}
+        return probs
+
    def classify(self, text):
        """Classify text.

@ -45,23 +79,11 @@ class Classifier:
        Returns
        -------
        str or None
-            The most likely category, or None if no guess was made.
+            The most likely category, or None if no words matching any
+            category in the model were found.

        """
-
-        model = self.model
-
-        text = gptc.tokenizer.tokenize(text)
-        probs = {}
-        for word in text:
-            try:
-                for category, value in model[word].items():
-                    try:
-                        probs[category] += value
-                    except KeyError:
-                        probs[category] = value
-            except KeyError:
-                pass
+        probs = self.confidence(text)
        try:
            return sorted(probs.items(), key=lambda x: x[1])[-1][0]
        except IndexError: