Compare commits

1 commit: f38f4ca801 ... 8a1cb6105e

Author | SHA1 | Date
---|---|---
 | 8a1cb6105e |
@@ -112,6 +112,13 @@ GPTC.

 See `models/unpacked/` for an example of the format.

+### `gptc.Classifier(model, max_ngram_length=1)`
+
+`Classifier` objects are deprecated starting with GPTC 3.1.0, and will be
+removed in 4.0.0. See [the README from
+3.0.2](https://git.kj7rrv.com/kj7rrv/gptc/src/tag/v3.0.1/README.md) if you need
+documentation.
+
 ## Ngrams

 GPTC optionally supports using ngrams to improve classification accuracy. They
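The note above doesn't name a replacement, but the rest of this diff drives classification through the model object itself. A minimal migration sketch, assuming a compiled model saved at the hypothetical path `my_model.gptc`; every call used here (`gptc.deserialize`, `gptc.Classifier`, `Model.confidence`) appears elsewhere in this diff:

```python
import gptc

# "my_model.gptc" is a placeholder path for a compiled, serialized model.
with open("my_model.gptc", "rb") as f:
    model = gptc.deserialize(f)

text = "some text to classify"

# Deprecated in 3.1.0, to be removed in 4.0.0:
classifier = gptc.Classifier(model, max_ngram_length=1)
print(classifier.classify(text))

# Non-deprecated route, as used by the CLI further down in this diff:
print(model.confidence(text, 1))
```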
@@ -3,6 +3,7 @@
 """General-Purpose Text Classifier"""

 from gptc.compiler import compile as compile
+from gptc.classifier import Classifier as Classifier
 from gptc.pack import pack as pack
 from gptc.model import Model as Model, deserialize as deserialize
 from gptc.tokenizer import normalize as normalize
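The redundant `import X as X` aliases above are the explicit re-export idiom that strict type checkers (for example mypy with `--no-implicit-reexport`) treat as intentional public API; with the added line, `Classifier` becomes reachable from the package root. A quick interactive sketch:

```python
# Sketch: after this change, Classifier is importable from the top-level
# package, alongside the names that were already re-exported.
import gptc

print(gptc.Classifier.__module__)                         # "gptc.classifier"
print(hasattr(gptc, "Model"), hasattr(gptc, "compile"))   # True True
```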
@@ -44,6 +44,19 @@ def main() -> None:
         type=int,
         default=1,
     )
+    group = classify_parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "-j",
+        "--json",
+        help="output confidence dict as JSON (default)",
+        action="store_true",
+    )
+    group.add_argument(
+        "-c",
+        "--category",
+        help="output most likely category or `None`",
+        action="store_true",
+    )

     check_parser = subparsers.add_parser(
         "check", help="check one word or ngram in model"
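Because the two flags are registered on a mutually exclusive group, `-j` and `-c` are accepted individually but rejected together. A standalone sketch of the same argparse pattern (not gptc's actual parser) showing the behavior:

```python
# Standalone sketch of the argparse pattern added above.
import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument("-j", "--json", action="store_true")
group.add_argument("-c", "--category", action="store_true")

print(parser.parse_args(["-c"]))   # Namespace(json=False, category=True)
# parser.parse_args(["-j", "-c"]) would exit with
# "error: argument -c/--category: not allowed with argument -j/--json"
```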
@@ -75,7 +88,12 @@ def main() -> None:
         else:
             text = sys.stdin.read()

-        print(json.dumps(model.confidence(text, args.max_ngram_length)))
+        if args.category:
+            classifier = gptc.Classifier(model, args.max_ngram_length)
+            print(classifier.classify(text))
+        else:
+            probabilities = model.confidence(text, args.max_ngram_length)
+            print(json.dumps(probabilities))
    elif args.subparser_name == "check":
        with open(args.model, "rb") as f:
            model = gptc.deserialize(f)
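With these branches wired in, the `classify` subcommand prints either the single most likely category (`-c`) or, by default, the full confidence mapping as JSON. A small illustration of the two output forms, with invented categories and probabilities:

```python
import json

# Hypothetical confidence dict of the shape model.confidence() returns;
# the categories and probabilities are made up for illustration.
probabilities = {"tech": 0.87, "sports": 0.13}

print(json.dumps(probabilities))                   # default / -j: {"tech": 0.87, "sports": 0.13}
print(max(probabilities, key=probabilities.get))   # roughly what -c prints: tech
```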
gptc/classifier.py (new executable file, 68 lines)

# SPDX-License-Identifier: GPL-3.0-or-later

import gptc.model
from typing import Dict, Union


class Classifier:
    """A text classifier.

    Parameters
    ----------
    model : dict
        A compiled GPTC model.

    max_ngram_length : int
        The maximum ngram length to use when tokenizing input. If this is
        greater than the value used when the model was compiled, it will be
        silently lowered to that value.

    Attributes
    ----------
    model : dict
        The model used.

    """

    def __init__(self, model: gptc.model.Model, max_ngram_length: int = 1):
        self.model = model
        model_ngrams = model.max_ngram_length
        self.max_ngram_length = min(max_ngram_length, model_ngrams)

    def confidence(self, text: str) -> Dict[str, float]:
        """Classify text with confidence.

        Parameters
        ----------
        text : str
            The text to classify

        Returns
        -------
        dict
            {category:probability, category:probability...} or {} if no words
            matching any categories in the model were found

        """
        return self.model.confidence(text, self.max_ngram_length)

    def classify(self, text: str) -> Union[str, None]:
        """Classify text.

        Parameters
        ----------
        text : str
            The text to classify

        Returns
        -------
        str or None
            The most likely category, or None if no words matching any
            category in the model were found.

        """
        probs: Dict[str, float] = self.confidence(text)
        try:
            return sorted(probs.items(), key=lambda x: x[1])[-1][0]
        except IndexError:
            return None
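`classify()` picks the winning category by sorting the `(category, probability)` pairs and taking the last one; for a non-empty dict that is an ordinary argmax (tie-breaking aside), and an empty dict falls through to `None` via the `IndexError`. A small illustration with a made-up confidence dict:

```python
# Illustration of the selection step in Classifier.classify(); the
# confidence values below are invented, not real model output.
probs = {"spam": 0.21, "ham": 0.64, "unknown": 0.15}

by_sort = sorted(probs.items(), key=lambda x: x[1])[-1][0]  # what classify() does
by_max = max(probs, key=lambda k: probs[k])                 # equivalent argmax for non-empty dicts

assert by_sort == by_max == "ham"
# With an empty dict, sorted(...)[-1] raises IndexError, which classify()
# catches and turns into None.
```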
gptc/compiler.py (109 lines changed)

@@ -2,65 +2,7 @@

 import gptc.tokenizer
 import gptc.model
-from typing import Iterable, Mapping, List, Dict, Union, Tuple
-
-
-def _count_words(
-    raw_model: Iterable[Mapping[str, str]],
-    max_ngram_length: int,
-    hash_algorithm: str,
-) -> Tuple[Dict[int, Dict[str, int]], Dict[str, int], List[str]]:
-    word_counts: Dict[int, Dict[str, int]] = {}
-    category_lengths: Dict[str, int] = {}
-    names: List[str] = []
-
-    for portion in raw_model:
-        text = gptc.tokenizer.hash(
-            gptc.tokenizer.tokenize(portion["text"], max_ngram_length),
-            hash_algorithm,
-        )
-        category = portion["category"]
-
-        if not category in names:
-            names.append(category)
-
-        category_lengths[category] = category_lengths.get(category, 0) + len(
-            text
-        )
-
-        for word in text:
-            if word in word_counts:
-                try:
-                    word_counts[word][category] += 1
-                except KeyError:
-                    word_counts[word][category] = 1
-            else:
-                word_counts[word] = {category: 1}
-
-    return word_counts, category_lengths, names
-
-
-def _get_weights(
-    min_count: int,
-    word_counts: Dict[int, Dict[str, int]],
-    category_lengths: Dict[str, int],
-    names: List[str],
-) -> Dict[int, List[int]]:
-    model: Dict[int, List[int]] = {}
-    for word, counts in word_counts.items():
-        if sum(counts.values()) >= min_count:
-            weights = {
-                category: value / category_lengths[category]
-                for category, value in counts.items()
-            }
-            total = sum(weights.values())
-            new_weights: List[int] = []
-            for category in names:
-                new_weights.append(
-                    round((weights.get(category, 0) / total) * 65535)
-                )
-            model[word] = new_weights
-    return model
+from typing import Iterable, Mapping, List, Dict, Union


 def compile(
@@ -85,8 +27,49 @@ def compile(
        A compiled GPTC model.

     """
-    word_counts, category_lengths, names = _count_words(
-        raw_model, max_ngram_length, hash_algorithm
-    )
-    model = _get_weights(min_count, word_counts, category_lengths, names)
+    word_counts: Dict[int, Dict[int, int]] = {}
+    category_lengths: Dict[int, int] = {}
+    names: List[str] = []
+
+    for portion in raw_model:
+        text = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(portion["text"], max_ngram_length),
+            hash_algorithm,
+        )
+        category_name = portion["category"]
+
+        if not category_name in names:
+            names.append(category_name)
+
+        category = names.index(category_name)
+
+        category_lengths[category] = category_lengths.get(category, 0) + len(
+            text
+        )
+
+        for word in text:
+            if word in word_counts:
+                try:
+                    word_counts[word][category] += 1
+                except KeyError:
+                    word_counts[word][category] = 1
+            else:
+                word_counts[word] = {category: 1}
+
+    model: Dict[int, List[int]] = {}
+    for word, counts in word_counts.items():
+        if sum(counts.values()) >= min_count:
+            weights = {
+                category: value / category_lengths[category]
+                for category, value in counts.items()
+            }
+            total = sum(weights.values())
+            new_weights: List[int] = []
+            for category in range(len(names)):
+                new_weights.append(
+                    round((weights.get(category, 0) / total) * 65535)
+                )
+            model[word] = new_weights
+
     return gptc.model.Model(model, names, max_ngram_length, hash_algorithm)
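The weighting inlined above first normalizes each raw count by the total token count of its category, then rescales each word's weights so they sum to about 65535, presumably so every weight fits in an unsigned 16-bit integer. A worked sketch with made-up counts and hypothetical category names:

```python
# Worked example of the weighting step in compile(), with invented numbers:
# suppose one hashed token occurs 3 times in category 0 and once in
# category 1, and the categories contain 200 and 50 tokens respectively.
counts = {0: 3, 1: 1}
category_lengths = {0: 200, 1: 50}
names = ["tech", "sports"]  # hypothetical category names

# Per-category frequency, so large categories don't dominate.
weights = {c: v / category_lengths[c] for c, v in counts.items()}  # {0: 0.015, 1: 0.02}
total = sum(weights.values())

new_weights = [round((weights.get(c, 0) / total) * 65535) for c in range(len(names))]
print(new_weights)  # [28086, 37449] -- sums to 65535, each value fits in 16 bits
```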
profiler.py (deleted, 16 lines)

# SPDX-License-Identifier: GPL-3.0-or-later

import cProfile
import gptc
import json
import sys

max_ngram_length = 10

with open("models/raw.json") as f:
    raw_model = json.load(f)

with open("models/benchmark_text.txt") as f:
    text = f.read()

cProfile.run("gptc.compile(raw_model, max_ngram_length)")