Reorganize code and improve README

2021-10-26 13:33:15 -07:00 · 2021-10-26 13:33:15 -07:00 · 91e35a0d61
commit 91e35a0d61
parent e1267b62b0
6 changed files with 134 additions and 135 deletions
--- a/README.md
+++ b/README.md
@ -1,29 +1,33 @@
 # GPTC
 General-purpose text classifier in Python
 GPTC provides both a CLI tool and a Python library.
 ## CLI Tool
-If you just want to do some simple classification on the command line, use the
+### Classifying text
-CLI tool. To use an existing model, <!-- When initialising a Classifier
+
-object, pass in the keyword argument `supress_uncompiled_model_warning=True`.
+    python -m gptc <modelfile>
-->use `gptc <modelfile>`. It will prompt for a string, and classify it,
+
-outputting the category on stdout (or "unknown" if it cannot determine
+This will prompt for a string and classify it, outputting the category on
-anything) See "Model format" for a description of the model. To compile a
+stdout (or "None" if it cannot determine
-model, use `gptc <rawmodelfile> -c|--compile <compiledmodelfile>`.
+anything).
 ### Compiling models
    gptc <raw model file> -c|--compile <compiled model file>
 ## Library
 If you want to use GPTC programmatically, use the library.
 ### `gptc.Classifier(model)`
-Create a `Classifier` object using the given model (as a Python list/dict, not
+Create a `Classifier` object using the given *compiled* model (as a dict, not
-as JSON). If the model is raw (a list), it will print a big warning on stderr.
+JSON).
-### `Classifier.classify(text)`
+#### `Classifier.classify(text)`
 Classify `text` with GPTC using the model used to instantiate the
 `Classifier`. Returns the category into which the text is placed (as a
 string), or `None` when it cannot classify the text.
 ### `gptc.compile(raw_model)`
 Compile a raw model (as a list, not JSON) and return the compiled model (as a
 dict).
 ## Model format
 Since you never really need to mess with compiled models, I won't discuss
 them. You can read the code if you really need to figure them out.
 This section explains the raw model format, which is how you should create and
 edit models.
@ -36,16 +40,13 @@ Raw models are formatted as a list of dicts. See below for the format:
        }
    ]
-Although GPTC handles models as Python lists (for raw models) or dicts (for
+GPTC handles models as Python `list`s of `dict`s of `str`s (for raw models) or
-compiled models), I recommend storing them in JSON format, mainly because the
+`dict`s of `str`s and `float`s (for compiled models), and they can be stored
-command-line tool uses JSON.
+in any way these Python objects can be. However, it is recommended to store
-
+them in JSON format for compatibility with the command-line tool.
 ## Example models
 I provide an example model trained to distinguish between texts written by
 Mark Twain and those written by William Shakespeare. I chose them because
 their works have all gone into the public domain, and their writing style is
 so different that GPTC can easily tell the difference, making it a good
 demonstration.
 ## Example model
 An example model, which is designed to distinguish between texts written by
 Mark Twain and those written by William Shakespeare, is available in `models`.
 The raw model is in `models/raw.json`; the compiled model is in
 `models/compiled.json`.
--- a/gptc/__init__.py
+++ b/gptc/__init__.py
@ -1,3 +1,4 @@
 """General-Purpose Text Classifier"""
-from gptc.gptc import compile, Classifier
+from gptc.compiler import compile
 from gptc.classifier import Classifier
--- a/gptc/classifier.py
+++ b/gptc/classifier.py
@ -0,0 +1,51 @@
 import gptc.tokenizer
 class Classifier:
    """Text classifier backed by a compiled GPTC model.

    Parameters
    ----------
    model : dict
        A compiled GPTC model. ``classify`` looks each token up in it and
        reads the entry as a mapping of category to weight.

    Attributes
    ----------
    model : dict
        The model passed to the constructor, stored unchanged.
    """

    def __init__(self, model):
        self.model = model

    def classify(self, text):
        """Return the best-scoring category for ``text``.

        The text is tokenized with ``gptc.tokenizer.tokenize``. For every
        token found in the model, that token's per-category weights are
        added to a running score for each category.

        Parameters
        ----------
        text : str
            The text to classify.

        Returns
        -------
        str or None
            The category with the highest total score, or None when no
            token of ``text`` appears in the model.
        """
        weights_by_token = self.model
        scores = {}
        for token in gptc.tokenizer.tokenize(text):
            try:
                weights = weights_by_token[token]
            except KeyError:
                # Tokens missing from the model contribute nothing.
                continue
            for category, weight in weights.items():
                if category in scores:
                    scores[category] += weight
                else:
                    scores[category] = weight
        if not scores:
            return None
        # A stable ascending sort means that, among equal top scores, the
        # category that entered ``scores`` last is the one returned.
        ranked = sorted(scores, key=scores.get)
        return ranked[-1]
--- a/gptc/compiler.py
+++ b/gptc/compiler.py
@ -0,0 +1,45 @@
 import gptc.tokenizer
 def compile(raw_model):
    """Build a compiled model from a raw one.

    Each portion's text is tokenized and the tokens are pooled per category.
    Every occurrence of a word then adds ``1 / n`` to that word's weight for
    the category, where ``n`` is the category's total token count. The
    result is keyed by word.

    Parameters
    ----------
    raw_model : list of dict
        A raw GPTC model. Each dict is read through its ``'text'`` and
        ``'category'`` keys.

    Returns
    -------
    dict
        A compiled GPTC model mapping each word to a dict of
        ``{category: weight}``.
    """
    tokens_by_category = {}
    for portion in raw_model:
        tokens = gptc.tokenizer.tokenize(portion['text'])
        label = portion['category']
        if label in tokens_by_category:
            tokens_by_category[label] += tokens
        else:
            tokens_by_category[label] = tokens

    compiled = {}
    for label, tokens in tokens_by_category.items():
        if not tokens:
            # A category without tokens has no words to weigh.
            continue
        step = 1/len(tokens)
        frequencies = {}
        for word in tokens:
            # Accumulate one step per occurrence.
            frequencies[word] = frequencies.get(word, 0) + step
        for word, weight in frequencies.items():
            compiled.setdefault(word, {})[label] = weight
    return compiled
--- a/gptc/gptc.py
+++ b/gptc/gptc.py
@ -1,110 +0,0 @@
 '''Main module for GPTC.'''
 import sys
 def _listify(text):
    """Split a string into lowercase word tokens.

    The text is lowercased; every character that is neither alphabetic nor
    an apostrophe is treated as a separator, and the non-empty pieces
    between separators are returned in order.
    """
    # Kept characters are never whitespace, so mapping every separator to a
    # space and splitting on whitespace yields exactly the runs of kept
    # characters.
    spaced = "".join(
        char if (char.isalpha() or char == "'") else " "
        for char in text.lower()
    )
    return spaced.split()
 def compile(raw_model):
    """Turn a raw model into a compiled one.

    The words of every portion are pooled by category. Within a category,
    each occurrence of a word adds ``1 / n`` to the word's weight, ``n``
    being the number of words pooled for that category. The weights are
    then regrouped by word.

    Parameters
    ----------
    raw_model : list of dict
        A raw GPTC model. Each dict is read through its ``'text'`` and
        ``'category'`` keys.

    Returns
    -------
    dict
        A compiled GPTC model: ``{word: {category: weight}}``.
    """
    # Phase 1: pool the words of all portions per category.
    pooled = {}
    for portion in raw_model:
        words = _listify(portion['text'])
        name = portion['category']
        if name not in pooled:
            pooled[name] = words
        else:
            pooled[name] += words

    # Phase 2: per-category weight of each word.
    frequencies = {}
    for name, words in pooled.items():
        share = frequencies[name] = {}
        for word in words:
            share[word] = share.get(word, 0) + 1/len(words)

    # Phase 3: regroup so the result is keyed by word.
    by_word = {}
    for name, share in frequencies.items():
        for word, value in share.items():
            by_word.setdefault(word, {})[name] = value
    return by_word
 class Classifier:
    """Classify text using a compiled GPTC model.

    Parameters
    ----------
    model : dict
        A compiled GPTC model.

    Attributes
    ----------
    model : dict
        The model given at construction time.
    """

    def __init__(self, model):
        self.model = model

    def classify(self, text):
        """Pick the most likely category for ``text``.

        Parameters
        ----------
        text : str
            The text to classify.

        Returns
        -------
        str or None
            The category whose summed weights are highest, or None if
            none of the words in ``text`` is present in the model.
        """
        model = self.model
        totals = {}
        for word in _listify(text):
            try:
                per_category = model[word]
            except KeyError:
                # Words the model does not contain are skipped.
                continue
            for category, value in per_category.items():
                if category in totals:
                    totals[category] += value
                else:
                    totals[category] = value
        ranked = sorted(totals.items(), key=lambda pair: pair[1])
        if ranked:
            # Highest total is last after the ascending sort.
            best_category, _ = ranked[-1]
            return best_category
        return None
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@ -0,0 +1,11 @@
 def tokenize(text):
    """Split a string into lowercase word tokens.

    The text is lowercased, then every maximal run of alphabetic characters
    and apostrophes becomes one token. Any other character acts as a
    separator and is discarded.
    """
    tokens = []
    current = []
    for char in text.lower():
        if char == "'" or char.isalpha():
            current.append(char)
        elif current:
            tokens.append("".join(current))
            current = []
    if current:
        # Flush the token that runs up to the end of the text.
        tokens.append("".join(current))
    return tokens