From 91e35a0d6176b93323afa18b5484677fc459c2e1 Mon Sep 17 00:00:00 2001
From: Samuel Sloniker
Date: Tue, 26 Oct 2021 13:33:15 -0700
Subject: [PATCH] Reorganize code and improve README

---
 README.md          |  49 ++++++++++----------
 gptc/__init__.py   |   3 +-
 gptc/classifier.py |  51 +++++++++++++++++++++
 gptc/compiler.py   |  45 +++++++++++++++++++
 gptc/gptc.py       | 110 ---------------------------------------------
 gptc/tokenizer.py  |  11 +++++
 6 files changed, 134 insertions(+), 135 deletions(-)
 create mode 100755 gptc/classifier.py
 create mode 100755 gptc/compiler.py
 delete mode 100755 gptc/gptc.py
 create mode 100644 gptc/tokenizer.py

diff --git a/README.md b/README.md
index a5a1000..e512e92 100644
--- a/README.md
+++ b/README.md
@@ -1,29 +1,33 @@
 # GPTC
 General-purpose text classifier in Python
 
+GPTC provides both a CLI tool and a Python library.
+
 ## CLI Tool
-If you just want to do some simple classification on the command line, use the
-CLI tool. To use an existing model, use `gptc <model file>`. It will prompt for a string, and classify it,
-outputting the category on stdout (or "unknown" if it cannot determine
-anything) See "Model format" for a description of the model. To compile a
-model, use `gptc -c|--compile <raw model file> <compiled model file>`.
+
+### Classifying text
+
+    python -m gptc <compiled model file>
+
+This will prompt for a string and classify it, outputting the category on
+stdout (or "None" if it cannot classify the text).
+
+### Compiling models
+
+    python -m gptc -c|--compile <raw model file> <compiled model file>
+
 ## Library
-If you want to use GPTC programmatically, use the library.
+
 ### `gptc.Classifier(model)`
-Create a `Classifier` object using the given model (as a Python list/dict, not
-as JSON). If the model is raw (a list), it will print a big warning on stderr.
-### `Classifier.classify(text)`
+Create a `Classifier` object using the given *compiled* model (as a dict, not
+JSON).
+
+#### `Classifier.classify(text)`
 Classify `text` with GPTC using the model used to instantiate the
 `Classifier`. Returns the category into which the text is placed (as a
 string), or `None` when it cannot classify the text.
+
+### `gptc.compile(raw_model)`
+Compile a raw model (as a list, not JSON) and return the compiled model (as a
+dict).
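+
+### Example usage
+
+A minimal sketch of how the pieces fit together (the sample texts and
+category names below are purely illustrative):
+
+    import gptc
+
+    # A raw model is a list of dicts, each with "text" and "category" keys.
+    raw_model = [
+        {"text": "the river was full of steamboats", "category": "twain"},
+        {"text": "thou art more lovely and more temperate", "category": "shakespeare"},
+    ]
+
+    # Compile the raw model into a dict of per-word weights, then classify.
+    model = gptc.compile(raw_model)
+    classifier = gptc.Classifier(model)
+    print(classifier.classify("life on the river"))  # "twain" for this toy model
+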
 ## Model format
-Since you never really need to mess with compiled models, I won't discuss
-them. You can read the code if you really need to figure them out.
-
 This section explains the raw model format, which is how you should create
 and edit models.
@@ -36,16 +40,13 @@ Raw models are formatted as a list of dicts. See below for the format:
     }
 ]
 
-Although GPTC handles models as Python lists (for raw models) or dicts (for
-compiled models), I recommend storing them in JSON format, mainly because the
-command-line tool uses JSON.
-
-## Example models
-I provide an example model trained to distinguish between texts written by
-Mark Twain and those written by William Shakespeare. I chose them because
-their works have all gone into the public domain, and their writing style is
-so different that GPTC can easily tell the difference, making it a good
-demonstration.
+GPTC handles models as Python `list`s of `dict`s of `str`s (for raw models) or
+nested `dict`s mapping words to per-category `float` weights (for compiled
+models), so they can be stored in any format that can represent these Python
+objects. However, it is recommended to store them in JSON format for
+compatibility with the command-line tool.
+
+## Example model
+An example model, which is designed to distinguish between texts written by
+Mark Twain and those written by William Shakespeare, is available in `models`.
 The raw model is in `models/raw.json`; the compiled model is in
 `models/compiled.json`.
diff --git a/gptc/__init__.py b/gptc/__init__.py
index ab1d591..7707f2d 100644
--- a/gptc/__init__.py
+++ b/gptc/__init__.py
@@ -1,3 +1,4 @@
 """General-Purpose Text Classifier"""
 
-from gptc.gptc import compile, Classifier
+from gptc.compiler import compile
+from gptc.classifier import Classifier
diff --git a/gptc/classifier.py b/gptc/classifier.py
new file mode 100755
index 0000000..2a36ea5
--- /dev/null
+++ b/gptc/classifier.py
@@ -0,0 +1,51 @@
+import gptc.tokenizer
+
+class Classifier:
+    """A text classifier.
+
+    Parameters
+    ----------
+    model : dict
+        A compiled GPTC model.
+
+    Attributes
+    ----------
+    model : dict
+        The model used.
+
+    """
+
+    def __init__(self, model):
+        self.model = model
+
+    def classify(self, text):
+        """Classify text.
+
+        Parameters
+        ----------
+        text : str
+            The text to classify
+
+        Returns
+        -------
+        str or None
+            The most likely category, or None if no guess was made.
+
+        """
+
+        model = self.model
+        text = gptc.tokenizer.tokenize(text)
+        probs = {}
+        for word in text:
+            try:
+                for category, value in model[word].items():
+                    try:
+                        probs[category] += value
+                    except KeyError:
+                        probs[category] = value
+            except KeyError:
+                pass
+        try:
+            return sorted(probs.items(), key=lambda x: x[1])[-1][0]
+        except IndexError:
+            return None
diff --git a/gptc/compiler.py b/gptc/compiler.py
new file mode 100755
index 0000000..b2b557c
--- /dev/null
+++ b/gptc/compiler.py
@@ -0,0 +1,45 @@
+import gptc.tokenizer
+
+def compile(raw_model):
+    """Compile a raw model.
+
+    Parameters
+    ----------
+    raw_model : list of dict
+        A raw GPTC model.
+
+    Returns
+    -------
+    dict
+        A compiled GPTC model.
+
+    """
+
+    categories = {}
+
+    for portion in raw_model:
+        text = gptc.tokenizer.tokenize(portion['text'])
+        category = portion['category']
+        try:
+            categories[category] += text
+        except KeyError:
+            categories[category] = text
+
+    categories_by_count = {}
+
+    for category, text in categories.items():
+        categories_by_count[category] = {}
+        for word in text:
+            try:
+                categories_by_count[category][word] += 1/len(categories[category])
+            except KeyError:
+                categories_by_count[category][word] = 1/len(categories[category])
+    word_weights = {}
+    for category, words in categories_by_count.items():
+        for word, value in words.items():
+            try:
+                word_weights[word][category] = value
+            except KeyError:
+                word_weights[word] = {category:value}
+
+    return word_weights
diff --git a/gptc/gptc.py b/gptc/gptc.py
deleted file mode 100755
index f76b12c..0000000
--- a/gptc/gptc.py
+++ /dev/null
@@ -1,110 +0,0 @@
-'''Main module for GPTC.'''
-
-import sys
-
-def _listify(text):
-    """Convert a string to a list of lemmas."""
-    out = [""]
-
-    for char in text.lower():
-        if char.isalpha() or char == "'":
-            out[-1] += char
-        elif out[-1] != "":
-            out.append("")
-
-    return [string for string in out if string]
-
-def compile(raw_model):
-    """Compile a raw model.
-
-    Parameters
-    ----------
-    raw_model : list of dict
-        A raw GPTC model.
-
-    Returns
-    -------
-    dict
-        A compiled GPTC model.
-
-    """
-
-    categories = {}
-
-    for portion in raw_model:
-        text = _listify(portion['text'])
-        category = portion['category']
-        try:
-            categories[category] += text
-        except KeyError:
-            categories[category] = text
-
-    categories_by_count = {}
-
-    for category, text in categories.items():
-        categories_by_count[category] = {}
-        for word in text:
-            try:
-                categories_by_count[category][word] += 1/len(categories[category])
-            except KeyError:
-                categories_by_count[category][word] = 1/len(categories[category])
-    word_weights = {}
-    for category, words in categories_by_count.items():
-        for word, value in words.items():
-            try:
-                word_weights[word][category] = value
-            except KeyError:
-                word_weights[word] = {category:value}
-
-    return word_weights
-
-
-class Classifier:
-    """A text classifier.
-
-    Parameters
-    ----------
-    model : dict
-        A compiled GPTC model.
-
-    Attributes
-    ----------
-    model : dict
-        The model used.
-
-    """
-
-    def __init__(self, model):
-        self.model = model
-
-    def classify(self, text):
-        """Classify text.
-
-        Parameters
-        ----------
-        text : str
-            The text to classify
-
-        Returns
-        -------
-        str or None
-            The most likely category, or None if no guess was made.
-
-        """
-
-        model = self.model
-        text = _listify(text)
-        probs = {}
-        for word in text:
-            try:
-                for category, value in model[word].items():
-                    try:
-                        probs[category] += value
-                    except KeyError:
-                        probs[category] = value
-            except KeyError:
-                pass
-        try:
-            return sorted(probs.items(), key=lambda x: x[1])[-1][0]
-        except IndexError:
-            return None
diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py
new file mode 100644
index 0000000..193532a
--- /dev/null
+++ b/gptc/tokenizer.py
@@ -0,0 +1,11 @@
+def tokenize(text):
+    """Convert a string to a list of lemmas."""
+    out = [""]
+
+    for char in text.lower():
+        if char.isalpha() or char == "'":
+            out[-1] += char
+        elif out[-1] != "":
+            out.append("")
+
+    return [string for string in out if string]
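
The new `gptc/tokenizer.py` keeps the behaviour of the old `_listify` helper:
lowercase the input, keep letters and apostrophes, and split on everything
else. A quick illustrative sketch (the sample string is arbitrary):

    >>> import gptc.tokenizer
    >>> gptc.tokenizer.tokenize("Don't go near the water!")
    ["don't", 'go', 'near', 'the', 'water']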