Add ngrams

First git commit from new laptop!
This commit is contained in:
Samuel Sloniker 2022-07-13 11:45:17 -07:00
parent c54c639b2f
commit ce80647bbb
7 changed files with 80 additions and 27 deletions

View File

@ -8,31 +8,34 @@ GPTC provides both a CLI tool and a Python library.
### Classifying text
python -m gptc classify <compiled model file>
python -m gptc classify [-n <max_ngram_length>] <compiled model file>
This will prompt for a string and classify it, then print (in JSON) a dict of
the format `{category: probability, category:probability, ...}` to stdout.
the format `{category: probability, category:probability, ...}` to stdout. (For
information about `-n <max_ngram_length>`, see section "Ngrams.")
Alternatively, if you only need the most likely category, you can use this:
python -m gptc classify [-c|--category] <compiled model file>
python -m gptc classify [-n <max_ngram_length>] <-c|--category> <compiled model file>
This will prompt for a string and classify it, outputting the category on
stdout (or "None" if it cannot determine anything).
### Compiling models
python -m gptc compile <raw model file>
python -m gptc compile [-n <max_ngram_length>] <raw model file>
This will print the compiled model in JSON to stdout.
## Library
### `gptc.Classifier(model)`
### `gptc.Classifier(model, max_ngram_length=1)`
Create a `Classifier` object using the given *compiled* model (as a dict, not
JSON).
For information about `max_ngram_length`, see section "Ngrams."
#### `Classifier.confidence(text)`
Classify `text`. Returns a dict of the format `{category: probability,
@ -43,10 +46,32 @@ category:probability, ...}`
Classify `text`. Returns the category into which the text is placed (as a
string), or `None` when it cannot classify the text.
### `gptc.compile(raw_model)`
### `gptc.compile(raw_model, max_ngram_length=1)`
Compile a raw model (as a list, not JSON) and return the compiled model (as a
dict).
For information about `max_ngram_length`, see section "Ngrams."
## Ngrams
GPTC optionally supports using ngrams to improve classification accuracy. They
are disabled by default (maximum length set to 1) for performance and
compatibility reasons. Enabling them significantly increases the time required
both for compilation and classification. The effect seems more significant for
compilation than for classification. Compiled models are also much larger when
ngrams are enabled. Larger maximum ngram lengths will result in slower
performance and larger files. It is a good idea to experiment with different
values and use the highest one at which GPTC is fast enough and models are
small enough for your needs.
Once a model is compiled at a certain maximum ngram length, it cannot be used
for classification with a higher value. If you instantiate a `Classifier` with
a model compiled with a lower `max_ngram_length`, the value will be silently
reduced to the one used when compiling the model.
Models compiled with older versions of GPTC which did not support ngrams are
handled the same way as models compiled with `max_ngram_length=1`.
## Model format
This section explains the raw model format, which is how you should create and
@ -73,6 +98,8 @@ Mark Twain and those written by William Shakespeare, is available in `models`.
The raw model is in `models/raw.json`; the compiled model is in
`models/compiled.json`.
The example model was compiled with `max_ngram_length=10`.
## Benchmark
A benchmark script is available for comparing performance of GPTC between

View File

@ -3,6 +3,7 @@ import gptc
import json
import sys
max_ngram_length = 10
compile_iterations = 100
classify_iterations = 10000
@ -12,9 +13,8 @@ with open("models/raw.json") as f:
with open("models/benchmark_text.txt") as f:
text = f.read()
classifier = gptc.Classifier(gptc.compile(raw_model))
print("Benchmarking GPTC on Python", sys.version)
print("Maximum ngram length:", max_ngram_length)
print(
"Average compilation time over",
@ -23,7 +23,7 @@ print(
round(
1000000
* timeit.timeit(
"gptc.compile(raw_model)",
"gptc.compile(raw_model, max_ngram_length)",
number=compile_iterations,
globals=globals(),
)
@ -33,6 +33,7 @@ print(
)
classifier = gptc.Classifier(gptc.compile(raw_model, max_ngram_length), max_ngram_length)
print(
"Average classification time over",
classify_iterations,
@ -48,3 +49,4 @@ print(
),
"microseconds",
)
print("--- benchmark complete ---")

View File

@ -6,14 +6,18 @@ import json
import sys
import gptc
parser = argparse.ArgumentParser(description="General Purpose Text Classifier", prog='gptc')
parser = argparse.ArgumentParser(
description="General Purpose Text Classifier", prog="gptc"
)
subparsers = parser.add_subparsers(dest="subparser_name", required=True)
compile_parser = subparsers.add_parser('compile', help='compile a raw model')
compile_parser = subparsers.add_parser("compile", help="compile a raw model")
compile_parser.add_argument("model", help="raw model to compile")
compile_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
classify_parser = subparsers.add_parser('classify', help='classify text')
classify_parser = subparsers.add_parser("classify", help="classify text")
classify_parser.add_argument("model", help="compiled model to use")
classify_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
group = classify_parser.add_mutually_exclusive_group()
group.add_argument(
"-j",
@ -33,10 +37,10 @@ args = parser.parse_args()
with open(args.model, "r") as f:
model = json.load(f)
if args.subparser_name == 'compile':
print(json.dumps(gptc.compile(model)))
if args.subparser_name == "compile":
print(json.dumps(gptc.compile(model, args.max_ngram_length)))
else:
classifier = gptc.Classifier(model)
classifier = gptc.Classifier(model, args.max_ngram_length)
if sys.stdin.isatty():
text = input("Text to analyse: ")

View File

@ -12,6 +12,11 @@ class Classifier:
model : dict
A compiled GPTC model.
max_ngram_length : int
The maximum ngram length to use when tokenizing input. If this is
greater than the value used when the model was compiled, it will be
silently lowered to that value.
Attributes
----------
model : dict
@ -19,12 +24,15 @@ class Classifier:
"""
def __init__(self, model):
def __init__(self, model, max_ngram_length=1):
if model.get("__version__", 0) != 3:
raise gptc.exceptions.UnsupportedModelError(
f"unsupported model version"
)
self.model = model
self.max_ngram_length = min(
max_ngram_length, model.get("__ngrams__", 1)
)
def confidence(self, text):
"""Classify text with confidence.
@ -44,7 +52,7 @@ class Classifier:
model = self.model
text = gptc.tokenizer.tokenize(text)
text = gptc.tokenizer.tokenize(text, self.max_ngram_length)
probs = {}
for word in text:
try:

View File

@ -3,7 +3,7 @@
import gptc.tokenizer
def compile(raw_model):
def compile(raw_model, max_ngram_length=1):
"""Compile a raw model.
Parameters
@ -11,6 +11,9 @@ def compile(raw_model):
raw_model : list of dict
A raw GPTC model.
max_ngram_length : int
    Maximum ngram length to compile with.
Returns
-------
dict
@ -21,7 +24,7 @@ def compile(raw_model):
categories = {}
for portion in raw_model:
text = gptc.tokenizer.tokenize(portion["text"])
text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
category = portion["category"]
try:
categories[category] += text
@ -64,7 +67,7 @@ def compile(raw_model):
)
model["__names__"] = names
model["__ngrams__"] = max_ngram_length
model["__version__"] = 3
return model

View File

@ -1,14 +1,23 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
def tokenize(text):
def tokenize(text, max_ngram_length=1):
"""Convert a string to a list of lemmas."""
out = [""]
tokens = [""]
for char in text.lower():
if char.isalpha() or char == "'":
out[-1] += char
elif out[-1] != "":
out.append("")
tokens[-1] += char
elif tokens[-1] != "":
tokens.append("")
return [string for string in out if string]
tokens = [string for string in tokens if string]
if max_ngram_length == 1:
return tokens
else:
ngrams = []
for ngram_length in range(1, max_ngram_length + 1):
for index in range(len(tokens) + 1 - ngram_length):
ngrams.append(" ".join(tokens[index : index + ngram_length]))
return ngrams

File diff suppressed because one or more lines are too long