6 changed files with 70 additions and 100 deletions
--- a/README.md
+++ b/README.md
@ -1,54 +1,42 @@
 # GPTC
 General-purpose text classifier in Python
 GPTC provides both a CLI tool and a Python library.
 ## CLI Tool
 ### Classifying text
-    python -m gptc classify <compiled model file>
+    python -m gptc <modelfile>
 This will prompt for a string and classify it, then print (in JSON) a dict of
 the format `{category: probability, category: probability, ...}` to stdout.
 Alternatively, if you only need the most likely category, you can use this:
    python -m gptc classify [-c|--category] <compiled model file>
 This will prompt for a string and classify it, outputting the category on
 stdout (or "None" if it cannot determine anything).
 Alternatively, if you need confidence data, use:
    python -m gptc -j <modelfile>
 This will print (in JSON) a dict of the format `{category: probability,
 category: probability, ...}` to stdout.
 ### Compiling models
-    python -m gptc compile <raw model file>
+    python -m gptc <raw model file> -c|--compile <compiled model file>
 This will write the compiled model, in JSON, to `<compiled model file>`.
 ## Library
 ### `gptc.Classifier(model)`
 Create a `Classifier` object using the given *compiled* model (as a dict, not
 JSON).
 #### `Classifier.confidence(text)`
 Classify `text`. Returns a dict of the format `{category: probability,
 category: probability, ...}`.
 #### `Classifier.classify(text)`
 Classify `text`. Returns the category into which the text is placed (as a
 string), or `None` when it cannot classify the text.
-
+## `gptc.compile(raw_model)`
 ### `gptc.compile(raw_model)`
 Compile a raw model (as a list, not JSON) and return the compiled model (as a
 dict).
 ## Model format
 This section explains the raw model format, which is how you should create and
 edit models.
@ -67,7 +55,6 @@ in any way these Python objects can be. However, it is recommended to store
 them in JSON format for compatibility with the command-line tool.
 ## Example model
 An example model, which is designed to distinguish between texts written by
 Mark Twain and those written by William Shakespeare, is available in `models`.
 The raw model is in `models/raw.json`; the compiled model is in
--- a/gptc/main.py
+++ b/gptc/main.py
@ -6,44 +6,28 @@ import json
 import sys
 import gptc
-parser = argparse.ArgumentParser(description="General Purpose Text Classifier", prog='gptc')
+parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
-subparsers = parser.add_subparsers(dest="subparser_name", required=True)
+parser.add_argument("model", help="model to use")
-
+parser.add_argument(
-compile_parser = subparsers.add_parser('compile', help='compile a raw model')
+    "-c", "--compile", help="compile raw model model to outfile", metavar="outfile"
 compile_parser.add_argument("model", help="raw model to compile")
 classify_parser = subparsers.add_parser('classify', help='classify text')
 classify_parser.add_argument("model", help="compiled model to use")
 group = classify_parser.add_mutually_exclusive_group()
 group.add_argument(
    "-j",
    "--json",
    help="output confidence dict as JSON (default)",
    action="store_true",
 )
-group.add_argument(
+parser.add_argument(
-    "-c",
+    "-j", "--confidence", help="output confidence dict in json", action="store_true"
    "--category",
    help="output most likely category or `None`",
    action="store_true",
 )
 args = parser.parse_args()
 with open(args.model, "r") as f:
-    model = json.load(f)
+    raw_model = json.load(f)
-
+if args.compile:
-if args.subparser_name == 'compile':
+    with open(args.compile, "w+") as f:
-    print(json.dumps(gptc.compile(model)))
+        json.dump(gptc.compile(raw_model), f)
 else:
-    classifier = gptc.Classifier(model)
+    classifier = gptc.Classifier(raw_model)
    if sys.stdin.isatty():
        text = input("Text to analyse: ")
    else:
        text = sys.stdin.read()
-
+    if args.confidence:
    if args.category:
        print(classifier.classify(text))
    else:
        print(json.dumps(classifier.confidence(text)))
    else:
        print(classifier.classify(text))
--- a/gptc/classifier.py
+++ b/gptc/classifier.py
@ -20,11 +20,26 @@ class Classifier:
    """
    def __init__(self, model):
-        if model.get("__version__", 0) != 3:
+        try:
-            raise gptc.exceptions.UnsupportedModelError(
+            model_version = model["__version__"]
-                f"unsupported model version"
+        except:
            model_version = 1
        if model_version == 3:
            self.model = model
        else:
            # The model is an unsupported version
            try:
                raw_model = model["__raw__"]
            except:
                raise gptc.exceptions.UnsupportedModelError(
                    "this model is unsupported and does not contain a raw model for recompiling"
                )
            warnings.warn(
                "model needed to be recompiled on-the-fly; please re-compile it and use the new compiled model in the future"
            )
-        self.model = model
+            self.model = gptc.compiler.compile(raw_model)
    def confidence(self, text):
        """Classify text with confidence.
@ -59,8 +74,7 @@ class Classifier:
            except KeyError:
                pass
        probs = {
-            model["__names__"][category]: value
+            model["__names__"][category]: value for category, value in probs.items()
            for category, value in probs.items()
        }
        total = sum(probs.values())
        probs = {category: value / total for category, value in probs.items()}
--- a/gptc/compiler.py
+++ b/gptc/compiler.py
@ -39,13 +39,9 @@ def compile(raw_model):
        categories_by_count[category] = {}
        for word in text:
            try:
-                categories_by_count[category][word] += 1 / len(
+                categories_by_count[category][word] += 1 / len(categories[category])
                    categories[category]
                )
            except KeyError:
-                categories_by_count[category][word] = 1 / len(
+                categories_by_count[category][word] = 1 / len(categories[category])
                    categories[category]
                )
    word_weights = {}
    for category, words in categories_by_count.items():
        for word, value in words.items():
@ -59,12 +55,11 @@ def compile(raw_model):
        total = sum(weights.values())
        model[word] = []
        for category in names:
-            model[word].append(
+            model[word].append(round((weights.get(category, 0) / total) * 65535))
                round((weights.get(category, 0) / total) * 65535)
            )
    model["__names__"] = names
    model["__version__"] = 3
    model["__raw__"] = raw_model
    return model
--- a/models/compiled.json
+++ b/models/compiled.json
--- a/utils/pack.py
+++ b/utils/pack.py
@ -4,38 +4,28 @@ import sys
 import os
 import json
 def pack(directory, print_exceptions=True):
    """Build a raw model from a directory of per-category text files.

    Every entry directly inside ``directory`` is treated as a category
    name, and every file inside that entry is read as one text belonging
    to the category.

    Args:
        directory: Path of the directory whose entries are the categories.
        print_exceptions: When true, each error hit while listing a
            category or reading a file is also printed to stderr.

    Returns:
        A ``(raw_model, exceptions)`` tuple. ``raw_model`` is a list of
        ``{"category": ..., "text": ...}`` dicts, one per file read.
        ``exceptions`` is a list of 1-tuples, each wrapping an exception
        that was caught along the way.
    """
    paths = os.listdir(directory)
    texts = {}
    exceptions = []
    for path in paths:
        texts[path] = []
        try:
            # Resolve against ``directory`` (not sys.argv) so the function
            # works for any caller, not only the command-line entry point.
            for file in os.listdir(os.path.join(directory, path)):
                try:
                    with open(os.path.join(directory, path, file)) as f:
                        texts[path].append(f.read())
                except Exception as e:
                    # Best effort: record the unreadable file and carry on.
                    exceptions.append((e,))
                    if print_exceptions:
                        print(e, file=sys.stderr)
        except Exception as e:
            # The entry could not be listed (e.g. it is not a directory);
            # record the error and continue with the next entry.
            exceptions.append((e,))
            if print_exceptions:
                print(e, file=sys.stderr)
    raw_model = []
    for category, cat_texts in texts.items():
        raw_model += [{"category": category, "text": i} for i in cat_texts]
    return raw_model, exceptions
 if len(sys.argv) != 2:
    print("usage: pack.py <path>", file=sys.stderr)
    exit(1)
-print(json.dumps(pack(sys.argv[1])[0]))
+paths = os.listdir(sys.argv[1])
 texts = {}
 for path in paths:
    texts[path] = []
    try:
        for file in os.listdir(os.path.join(sys.argv[1], path)):
            try:
                with open(os.path.join(sys.argv[1], path, file)) as f:
                    texts[path].append(f.read())
            except Exception as e:
                print(e, file=sys.stderr)
    except Exception as e:
        print(e, file=sys.stderr)
 raw_model = []
 for category, cat_texts in texts.items():
    raw_model += [{"category": category, "text": i} for i in cat_texts]
 print(json.dumps(raw_model))