From 91e35a0d6176b93323afa18b5484677fc459c2e1 Mon Sep 17 00:00:00 2001
From: Samuel Sloniker
Date: Tue, 26 Oct 2021 13:33:15 -0700
Subject: [PATCH] Reorganize code and improve README

---
 README.md          |  49 ++++++++++----------
 gptc/__init__.py   |   3 +-
 gptc/classifier.py |  51 +++++++++++++++++++++
 gptc/compiler.py   |  45 +++++++++++++++++++
 gptc/gptc.py       | 110 ---------------------------------------------
 gptc/tokenizer.py  |  11 +++++
 6 files changed, 134 insertions(+), 135 deletions(-)
 create mode 100755 gptc/classifier.py
 create mode 100755 gptc/compiler.py
 delete mode 100755 gptc/gptc.py
 create mode 100644 gptc/tokenizer.py

diff --git a/README.md b/README.md
index a5a1000..e512e92 100644
--- a/README.md
+++ b/README.md
@@ -1,29 +1,33 @@
 # GPTC
 General-purpose text classifier in Python
 
+GPTC provides both a CLI tool and a Python library.
+
 ## CLI Tool
-If you just want to do some simple classification on the command line, use the
-CLI tool. To use an existing model, use `gptc <model file>`. It will prompt for a string, and classify it,
-outputting the category on stdout (or "unknown" if it cannot determine
-anything) See "Model format" for a description of the model. To compile a
-model, use `gptc -c|--compile <raw model file> <compiled model file>`.
+
+### Classifying text
+
+    python -m gptc <compiled model file>
+
+This will prompt for a string and classify it, outputting the category on
+stdout (or "None" if it cannot classify the text).
+
+### Compiling models
+
+    python -m gptc -c|--compile <raw model file> <compiled model file>
+
 ## Library
-If you want to use GPTC programmatically, use the library.
+
 ### `gptc.Classifier(model)`
-Create a `Classifier` object using the given model (as a Python list/dict, not
-as JSON). If the model is raw (a list), it will print a big warning on stderr.
-### `Classifier.classify(text)`
+Create a `Classifier` object using the given *compiled* model (as a dict, not
+JSON).
+
+#### `Classifier.classify(text)`
 Classify `text` with GPTC using the model used to instantiate the
 `Classifier`. Returns the category into which the text is placed (as a
 string), or `None` when it cannot classify the text.
+
+### `gptc.compile(raw_model)`
+Compile a raw model (as a list, not JSON) and return the compiled model (as a
+dict).
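+
+### Example usage
+
+A minimal sketch of how the pieces fit together (the sample texts and
+category names below are purely illustrative):
+
+    import gptc
+
+    # A raw model is a list of dicts, each with "text" and "category" keys.
+    raw_model = [
+        {"text": "the river was full of steamboats", "category": "twain"},
+        {"text": "thou art more lovely and more temperate", "category": "shakespeare"},
+    ]
+
+    # Compile the raw model into a dict of per-word weights, then classify.
+    model = gptc.compile(raw_model)
+    classifier = gptc.Classifier(model)
+    print(classifier.classify("life on the river"))  # "twain" for this toy model
+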
 ## Model format
-Since you never really need to mess with compiled models, I won't discuss
-them. You can read the code if you really need to figure them out.
-
 This section explains the raw model format, which is how you should create
 and edit models.
@@ -36,16 +40,13 @@ Raw models are formatted as a list of dicts. See below for the format:
     }
 ]
 
-Although GPTC handles models as Python lists (for raw models) or dicts (for
-compiled models), I recommend storing them in JSON format, mainly because the
-command-line tool uses JSON.
-
-## Example models
-I provide an example model trained to distinguish between texts written by
-Mark Twain and those written by William Shakespeare. I chose them because
-their works have all gone into the public domain, and their writing style is
-so different that GPTC can easily tell the difference, making it a good
-demonstration.
+GPTC handles models as Python `list`s of `dict`s of `str`s (for raw models) or
+nested `dict`s mapping words to per-category `float` weights (for compiled
+models), so they can be stored in any format that can represent these Python
+objects. However, it is recommended to store them in JSON format for
+compatibility with the command-line tool.
+
+## Example model
+An example model, which is designed to distinguish between texts written by
+Mark Twain and those written by William Shakespeare, is available in `models`.
 The raw model is in `models/raw.json`; the compiled model is in
 `models/compiled.json`.
diff --git a/gptc/__init__.py b/gptc/__init__.py
index ab1d591..7707f2d 100644
--- a/gptc/__init__.py
+++ b/gptc/__init__.py
@@ -1,3 +1,4 @@
 """General-Purpose Text Classifier"""
 
-from gptc.gptc import compile, Classifier
+from gptc.compiler import compile
+from gptc.classifier import Classifier
diff --git a/gptc/classifier.py b/gptc/classifier.py
new file mode 100755
index 0000000..2a36ea5
--- /dev/null
+++ b/gptc/classifier.py
@@ -0,0 +1,51 @@
+import gptc.tokenizer
+
+class Classifier:
+    """A text classifier.
+
+    Parameters
+    ----------
+    model : dict
+        A compiled GPTC model.
+
+    Attributes
+    ----------
+    model : dict
+        The model used.
+
+    """
+
+    def __init__(self, model):
+        self.model = model
+
+    def classify(self, text):
+        """Classify text.
+
+        Parameters
+        ----------
+        text : str
+            The text to classify
+
+        Returns
+        -------
+        str or None
+            The most likely category, or None if no guess was made.
+
+        """
+
+        model = self.model
+        text = gptc.tokenizer.tokenize(text)
+        probs = {}
+        for word in text:
+            try:
+                for category, value in model[word].items():
+                    try:
+                        probs[category] += value
+                    except KeyError:
+                        probs[category] = value
+            except KeyError:
+                pass
+        try:
+            return sorted(probs.items(), key=lambda x: x[1])[-1][0]
+        except IndexError:
+            return None
diff --git a/gptc/compiler.py b/gptc/compiler.py
new file mode 100755
index 0000000..b2b557c
--- /dev/null
+++ b/gptc/compiler.py
@@ -0,0 +1,45 @@
+import gptc.tokenizer
+
+def compile(raw_model):
+    """Compile a raw model.
+
+    Parameters
+    ----------
+    raw_model : list of dict
+        A raw GPTC model.
+
+    Returns
+    -------
+    dict
+        A compiled GPTC model.
+
+    """
+
+    categories = {}
+
+    for portion in raw_model:
+        text = gptc.tokenizer.tokenize(portion['text'])
+        category = portion['category']
+        try:
+            categories[category] += text
+        except KeyError:
+            categories[category] = text
+
+    categories_by_count = {}
+
+    for category, text in categories.items():
+        categories_by_count[category] = {}
+        for word in text:
+            try:
+                categories_by_count[category][word] += 1/len(categories[category])
+            except KeyError:
+                categories_by_count[category][word] = 1/len(categories[category])
+    word_weights = {}
+    for category, words in categories_by_count.items():
+        for word, value in words.items():
+            try:
+                word_weights[word][category] = value
+            except KeyError:
+                word_weights[word] = {category:value}
+
+    return word_weights
diff --git a/gptc/gptc.py b/gptc/gptc.py
deleted file mode 100755
index f76b12c..0000000
--- a/gptc/gptc.py
+++ /dev/null
@@ -1,110 +0,0 @@
-'''Main module for GPTC.'''
-
-import sys
-
-def _listify(text):
-    """Convert a string to a list of lemmas."""
-    out = [""]
-
-    for char in text.lower():
-        if char.isalpha() or char == "'":
-            out[-1] += char
-        elif out[-1] != "":
-            out.append("")
-
-    return [string for string in out if string]
-
-def compile(raw_model):
-    """Compile a raw model.
-
-    Parameters
-    ----------
-    raw_model : list of dict
-        A raw GPTC model.
-
-    Returns
-    -------
-    dict
-        A compiled GPTC model.
-
-    """
-
-    categories = {}
-
-    for portion in raw_model:
-        text = _listify(portion['text'])
-        category = portion['category']
-        try:
-            categories[category] += text
-        except KeyError:
-            categories[category] = text
-
-    categories_by_count = {}
-
-    for category, text in categories.items():
-        categories_by_count[category] = {}
-        for word in text:
-            try:
-                categories_by_count[category][word] += 1/len(categories[category])
-            except KeyError:
-                categories_by_count[category][word] = 1/len(categories[category])
-    word_weights = {}
-    for category, words in categories_by_count.items():
-        for word, value in words.items():
-            try:
-                word_weights[word][category] = value
-            except KeyError:
-                word_weights[word] = {category:value}
-
-    return word_weights
-
-
-class Classifier:
-    """A text classifier.
-
-    Parameters
-    ----------
-    model : dict
-        A compiled GPTC model.
-
-    Attributes
-    ----------
-    model : dict
-        The model used.
-
-    """
-
-    def __init__(self, model):
-        self.model = model
-
-    def classify(self, text):
-        """Classify text.
-
-        Parameters
-        ----------
-        text : str
-            The text to classify
-
-        Returns
-        -------
-        str or None
-            The most likely category, or None if no guess was made.
-
-        """
-
-        model = self.model
-        text = _listify(text)
-        probs = {}
-        for word in text:
-            try:
-                for category, value in model[word].items():
-                    try:
-                        probs[category] += value
-                    except KeyError:
-                        probs[category] = value
-            except KeyError:
-                pass
-        try:
-            return sorted(probs.items(), key=lambda x: x[1])[-1][0]
-        except IndexError:
-            return None
diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py
new file mode 100644
index 0000000..193532a
--- /dev/null
+++ b/gptc/tokenizer.py
@@ -0,0 +1,11 @@
+def tokenize(text):
+    """Convert a string to a list of lemmas."""
+    out = [""]
+
+    for char in text.lower():
+        if char.isalpha() or char == "'":
+            out[-1] += char
+        elif out[-1] != "":
+            out.append("")
+
+    return [string for string in out if string]
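
The new `gptc/tokenizer.py` keeps the behaviour of the old `_listify` helper:
lowercase the input, keep letters and apostrophes, and split on everything
else. A quick illustrative sketch (the sample string is arbitrary):

    >>> import gptc.tokenizer
    >>> gptc.tokenizer.tokenize("Don't go near the water!")
    ["don't", 'go', 'near', 'the', 'water']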