Add ngrams

First git commit from new laptop!
2022-07-13 11:45:17 -07:00 · 2022-07-13 11:45:17 -07:00 · ce80647bbb
commit ce80647bbb
parent c54c639b2f
7 changed files with 80 additions and 27 deletions
--- a/README.md
+++ b/README.md
@ -8,31 +8,34 @@ GPTC provides both a CLI tool and a Python library.
 ### Classifying text
-    python -m gptc classify <compiled model file>
+    python -m gptc classify [-n <max_ngram_length>] <compiled model file>
 This will prompt for a string and classify it, then print (in JSON) a dict of
-the format `{category: probability, category:probability, ...}` to stdout.
+the format `{category: probability, category:probability, ...}` to stdout. (For
 information about `-n <max_ngram_length>`, see section "Ngrams.")
 Alternatively, if you only need the most likely category, you can use this:
-    python -m gptc classify [-c|--category] <compiled model file>
+    python -m gptc classify [-n <max_ngram_length>] <-c|--category> <compiled model file>
 This will prompt for a string and classify it, outputting the category on
 stdout (or "None" if it cannot determine anything).
 ### Compiling models
-    python -m gptc compile <raw model file>
+    python -m gptc compile [-n <max_ngram_length>] <raw model file>
 This will print the compiled model in JSON to stdout.
 ## Library
-### `gptc.Classifier(model)`
+### `gptc.Classifier(model, max_ngram_length=1)`
 Create a `Classifier` object using the given *compiled* model (as a dict, not
 JSON).
 For information about `max_ngram_length`, see section "Ngrams."
 #### `Classifier.confidence(text)`
 Classify `text`. Returns a dict of the format `{category: probability,
@ -43,10 +46,32 @@ category:probability, ...}`
 Classify `text`. Returns the category into which the text is placed (as a
 string), or `None` when it cannot classify the text.
-### `gptc.compile(raw_model)`
+### `gptc.compile(raw_model, max_ngram_length=1)`
 Compile a raw model (as a list, not JSON) and return the compiled model (as a
 dict).
 For information about `max_ngram_length`, see section "Ngrams."
 ## Ngrams
 GPTC optionally supports using ngrams to improve classification accuracy. They
 are disabled by default (maximum length set to 1) for performance and
 compatibility reasons. Enabling them significantly increases the time required
 both for compilation and classification. The effect seems more significant for
 compilation than for classification. Compiled models are also much larger when
 ngrams are enabled. Larger maximum ngram lengths will result in slower
 performance and larger files. It is a good idea to experiment with different
 values and use the highest one at which GPTC is fast enough and models are
 small enough for your needs.
 Once a model is compiled at a certain maximum ngram length, it cannot be used
 for classification with a higher value. If you instantiate a `Classifier` with
 a `max_ngram_length` higher than the one the model was compiled with, the value
 will be silently reduced to the one used when compiling the model.
 Models compiled with older versions of GPTC which did not support ngrams are
 handled the same way as models compiled with `max_ngram_length=1`.
 ## Model format
 This section explains the raw model format, which is how you should create and
@ -73,6 +98,8 @@ Mark Twain and those written by William Shakespeare, is available in `models`.
 The raw model is in `models/raw.json`; the compiled model is in
 `models/compiled.json`.
 The example model was compiled with `max_ngram_length=10`.
 ## Benchmark
 A benchmark script is available for comparing performance of GPTC between
--- a/benchmark.py
+++ b/benchmark.py
@ -3,6 +3,7 @@ import gptc
 import json
 import sys
 max_ngram_length = 10
 compile_iterations = 100
 classify_iterations = 10000
@ -12,9 +13,8 @@ with open("models/raw.json") as f:
 with open("models/benchmark_text.txt") as f:
    text = f.read()
 classifier = gptc.Classifier(gptc.compile(raw_model))
 print("Benchmarking GPTC on Python", sys.version)
 print("Maximum ngram length:", max_ngram_length)
 print(
    "Average compilation time over",
@ -23,7 +23,7 @@ print(
    round(
        1000000
        * timeit.timeit(
-            "gptc.compile(raw_model)",
+            "gptc.compile(raw_model, max_ngram_length)",
            number=compile_iterations,
            globals=globals(),
        )
@ -33,6 +33,7 @@ print(
 )
 classifier = gptc.Classifier(gptc.compile(raw_model, max_ngram_length), max_ngram_length)
 print(
    "Average classification time over",
    classify_iterations,
@ -48,3 +49,4 @@ print(
    ),
    "microseconds",
 )
 print("--- benchmark complete ---")
--- a/gptc/main.py
+++ b/gptc/main.py
@ -6,14 +6,18 @@ import json
 import sys
 import gptc
-parser = argparse.ArgumentParser(description="General Purpose Text Classifier", prog='gptc')
+parser = argparse.ArgumentParser(
    description="General Purpose Text Classifier", prog="gptc"
 )
 subparsers = parser.add_subparsers(dest="subparser_name", required=True)
-compile_parser = subparsers.add_parser('compile', help='compile a raw model')
+compile_parser = subparsers.add_parser("compile", help="compile a raw model")
 compile_parser.add_argument("model", help="raw model to compile")
 compile_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
-classify_parser = subparsers.add_parser('classify', help='classify text')
+classify_parser = subparsers.add_parser("classify", help="classify text")
 classify_parser.add_argument("model", help="compiled model to use")
 classify_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
 group = classify_parser.add_mutually_exclusive_group()
 group.add_argument(
    "-j",
@ -33,10 +37,10 @@ args = parser.parse_args()
 with open(args.model, "r") as f:
    model = json.load(f)
-if args.subparser_name == 'compile':
+if args.subparser_name == "compile":
-    print(json.dumps(gptc.compile(model)))
+    print(json.dumps(gptc.compile(model, args.max_ngram_length)))
 else:
-    classifier = gptc.Classifier(model)
+    classifier = gptc.Classifier(model, args.max_ngram_length)
    if sys.stdin.isatty():
        text = input("Text to analyse: ")
--- a/gptc/classifier.py
+++ b/gptc/classifier.py
@ -12,6 +12,11 @@ class Classifier:
    model : dict
        A compiled GPTC model.
    max_ngram_length : int
        The maximum ngram length to use when tokenizing input. If this is
        greater than the value used when the model was compiled, it will be
        silently lowered to that value.
    Attributes
    ----------
    model : dict
@ -19,12 +24,15 @@ class Classifier:
    """
-    def __init__(self, model):
+    def __init__(self, model, max_ngram_length=1):
        if model.get("__version__", 0) != 3:
            raise gptc.exceptions.UnsupportedModelError(
                f"unsupported model version"
            )
        self.model = model
        self.max_ngram_length = min(
            max_ngram_length, model.get("__ngrams__", 1)
        )
    def confidence(self, text):
        """Classify text with confidence.
@ -44,7 +52,7 @@ class Classifier:
        model = self.model
-        text = gptc.tokenizer.tokenize(text)
+        text = gptc.tokenizer.tokenize(text, self.max_ngram_length)
        probs = {}
        for word in text:
            try:
--- a/gptc/compiler.py
+++ b/gptc/compiler.py
@ -3,7 +3,7 @@
 import gptc.tokenizer
-def compile(raw_model):
+def compile(raw_model, max_ngram_length=1):
    """Compile a raw model.
    Parameters
@ -11,6 +11,9 @@ def compile(raw_model):
    raw_model : list of dict
        A raw GPTC model.
    max_ngram_length : int
        Maximum ngram length to compile with.
    Returns
    -------
    dict
@ -21,7 +24,7 @@ def compile(raw_model):
    categories = {}
    for portion in raw_model:
-        text = gptc.tokenizer.tokenize(portion["text"])
+        text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
        category = portion["category"]
        try:
            categories[category] += text
@ -64,7 +67,7 @@ def compile(raw_model):
            )
    model["__names__"] = names
-
+    model["__ngrams__"] = max_ngram_length
    model["__version__"] = 3
    return model
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@ -1,14 +1,23 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
-def tokenize(text):
+def tokenize(text, max_ngram_length=1):
    """Convert a string to a list of lemmas."""
-    out = [""]
+    tokens = [""]
    for char in text.lower():
        if char.isalpha() or char == "'":
-            out[-1] += char
+            tokens[-1] += char
-        elif out[-1] != "":
+        elif tokens[-1] != "":
-            out.append("")
+            tokens.append("")
-    return [string for string in out if string]
+    tokens = [string for string in tokens if string]
    if max_ngram_length == 1:
        return tokens
    else:
        ngrams = []
        for ngram_length in range(1, max_ngram_length + 1):
            for index in range(len(tokens) + 1 - ngram_length):
                ngrams.append(" ".join(tokens[index : index + ngram_length]))
        return ngrams
--- a/models/compiled.json
+++ b/models/compiled.json