Reorganize code and improve README

This commit is contained in:
Samuel Sloniker 2021-10-26 13:33:15 -07:00
parent e1267b62b0
commit 91e35a0d61
6 changed files with 134 additions and 135 deletions

README.md

@ -1,29 +1,33 @@
# GPTC
General-purpose text classifier in Python
GPTC provides both a CLI tool and a Python library.
## CLI Tool
If you just want to do some simple classification on the command line, use
the CLI tool. To use an existing model, use `gptc <modelfile>`. It will prompt
for a string and classify it, outputting the category on stdout (or "unknown"
if it cannot determine anything). See "Model format" for a description of the
model. To compile a model, use `gptc <rawmodelfile> -c|--compile <compiledmodelfile>`.
### Classifying text
`python -m gptc <modelfile>`
This will prompt for a string and classify it, outputting the category on
stdout (or "None" if it cannot determine
anything).
### Compiling models
`gptc <raw model file> -c|--compile <compiled model file>`
## Library
If you want to use GPTC programmatically, use the library.
### `gptc.Classifier(model)`
Create a `Classifier` object using the given model (as a Python list/dict, not
as JSON). If the model is raw (a list), it will print a big warning on stderr.
### `Classifier.classify(text)`
Create a `Classifier` object using the given *compiled* model (as a dict, not
JSON).
#### `Classifier.classify(text)`
Classify `text` with GPTC using the model used to instantiate the
`Classifier`. Returns the category into which the text is placed (as a
string), or `None` when it cannot classify the text.
### `gptc.compile(raw_model)`
Compile a raw model (as a list, not JSON) and return the compiled model (as a
dict).
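For example, a minimal end-to-end sketch using the example model shipped in
`models/raw.json` (see "Example model" below) might look like this:

```python
import json

import gptc

with open("models/raw.json") as f:
    raw_model = json.load(f)

model = gptc.compile(raw_model)      # raw list -> compiled dict
classifier = gptc.Classifier(model)  # expects a *compiled* model

print(classifier.classify("The report of my death was an exaggeration."))
# Prints the most likely category, or None if no known words were found.
```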
## Model format
Since you never really need to mess with compiled models, I won't discuss them
here; you can read the code if you need to figure them out.
This section explains the raw model format, which is how you should create and
edit models.
@ -36,16 +40,13 @@ Raw models are formatted as a list of dicts. See below for the format:
}
]
Although GPTC handles models as Python lists (for raw models) or dicts (for
compiled models), I recommend storing them in JSON format, mainly because the
command-line tool uses JSON.
## Example models
I provide an example model trained to distinguish between texts written by
Mark Twain and those written by William Shakespeare. I chose them because
their works have all gone into the public domain, and their writing styles are
so different that GPTC can easily tell the difference, making it a good
demonstration.
GPTC handles models as Python `list`s of `dict`s of `str`s (for raw models) or
nested `dict`s mapping words to per-category `float` weights (for compiled
models), and they can be stored in any format that can represent these Python
objects. However, storing them as JSON is recommended for compatibility with
the command-line tool.
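As an illustrative sketch (the categories, text, and file name here are made
up), a tiny raw model and its JSON serialization could look like this:

```python
import json

# Each portion is a dict with a 'category' label and some sample 'text'.
raw_model = [
    {"category": "twain", "text": "The report of my death was an exaggeration."},
    {"category": "shakespeare", "text": "All the world's a stage."},
]

with open("my_model.json", "w") as f:
    json.dump(raw_model, f)
```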
## Example model
An example model, which is designed to distinguish between texts written by
Mark Twain and those written by William Shakespeare, is available in `models`.
The raw model is in `models/raw.json`; the compiled model is in
`models/compiled.json`.

gptc/__init__.py

@ -1,3 +1,4 @@
"""General-Purpose Text Classifier"""
from gptc.gptc import compile, Classifier
from gptc.compiler import compile
from gptc.classifier import Classifier

gptc/classifier.py Executable file (+51 lines)

@ -0,0 +1,51 @@
import gptc.tokenizer


class Classifier:
    """A text classifier.

    Parameters
    ----------
    model : dict
        A compiled GPTC model.

    Attributes
    ----------
    model : dict
        The model used.
    """

    def __init__(self, model):
        self.model = model

    def classify(self, text):
        """Classify text.

        Parameters
        ----------
        text : str
            The text to classify

        Returns
        -------
        str or None
            The most likely category, or None if no guess was made.
        """
        model = self.model
        text = gptc.tokenizer.tokenize(text)
        probs = {}
        # Sum the weight of each word for every category it appears in;
        # words that are not in the model are simply skipped.
        for word in text:
            try:
                for category, value in model[word].items():
                    try:
                        probs[category] += value
                    except KeyError:
                        probs[category] = value
            except KeyError:
                pass
        # Return the category with the highest total weight, or None if
        # no word in the text was found in the model.
        try:
            return sorted(probs.items(), key=lambda x: x[1])[-1][0]
        except IndexError:
            return None
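For illustration, a minimal sketch of using `Classifier` directly with a
hand-written compiled model (the weights below are made up):

```python
from gptc.classifier import Classifier

# A compiled model maps each word to its per-category weights.
model = {
    "exaggeration": {"twain": 0.5},
    "stage": {"shakespeare": 0.4},
}

classifier = Classifier(model)
print(classifier.classify("All the world's a stage"))  # shakespeare
print(classifier.classify("nothing matches here"))     # None
```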

gptc/compiler.py Executable file (+45 lines)

@ -0,0 +1,45 @@
import gptc.tokenizer


def compile(raw_model):
    """Compile a raw model.

    Parameters
    ----------
    raw_model : list of dict
        A raw GPTC model.

    Returns
    -------
    dict
        A compiled GPTC model.
    """
    # Concatenate the tokenized text of all portions in each category.
    categories = {}
    for portion in raw_model:
        text = gptc.tokenizer.tokenize(portion['text'])
        category = portion['category']
        try:
            categories[category] += text
        except KeyError:
            categories[category] = text

    # Count each word's occurrences per category, normalized by the
    # total number of words in that category.
    categories_by_count = {}
    for category, text in categories.items():
        categories_by_count[category] = {}
        for word in text:
            try:
                categories_by_count[category][word] += 1/len(categories[category])
            except KeyError:
                categories_by_count[category][word] = 1/len(categories[category])

    # Invert the mapping so each word maps to its per-category weights.
    word_weights = {}
    for category, words in categories_by_count.items():
        for word, value in words.items():
            try:
                word_weights[word][category] = value
            except KeyError:
                word_weights[word] = {category:value}

    return word_weights
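As a worked sketch of what `compile` produces (the categories and text are made
up), each weight is a word's occurrence count divided by the total word count
of its category:

```python
import gptc.compiler

raw_model = [
    {"category": "a", "text": "spam spam eggs"},
    {"category": "b", "text": "eggs ham"},
]

print(gptc.compiler.compile(raw_model))
# (values rounded)
# {'spam': {'a': 0.667}, 'eggs': {'a': 0.333, 'b': 0.5}, 'ham': {'b': 0.5}}
```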

gptc/gptc.py Deleted file

@ -1,110 +0,0 @@
'''Main module for GPTC.'''

import sys


def _listify(text):
    """Convert a string to a list of lemmas."""
    out = [""]
    for char in text.lower():
        if char.isalpha() or char == "'":
            out[-1] += char
        elif out[-1] != "":
            out.append("")
    return [string for string in out if string]


def compile(raw_model):
    """Compile a raw model.

    Parameters
    ----------
    raw_model : list of dict
        A raw GPTC model.

    Returns
    -------
    dict
        A compiled GPTC model.
    """
    categories = {}
    for portion in raw_model:
        text = _listify(portion['text'])
        category = portion['category']
        try:
            categories[category] += text
        except KeyError:
            categories[category] = text
    categories_by_count = {}
    for category, text in categories.items():
        categories_by_count[category] = {}
        for word in text:
            try:
                categories_by_count[category][word] += 1/len(categories[category])
            except KeyError:
                categories_by_count[category][word] = 1/len(categories[category])
    word_weights = {}
    for category, words in categories_by_count.items():
        for word, value in words.items():
            try:
                word_weights[word][category] = value
            except KeyError:
                word_weights[word] = {category:value}
    return word_weights


class Classifier:
    """A text classifier.

    Parameters
    ----------
    model : dict
        A compiled GPTC model.

    Attributes
    ----------
    model : dict
        The model used.
    """

    def __init__(self, model):
        self.model = model

    def classify(self, text):
        """Classify text.

        Parameters
        ----------
        text : str
            The text to classify

        Returns
        -------
        str or None
            The most likely category, or None if no guess was made.
        """
        model = self.model
        text = _listify(text)
        probs = {}
        for word in text:
            try:
                for category, value in model[word].items():
                    try:
                        probs[category] += value
                    except KeyError:
                        probs[category] = value
            except KeyError:
                pass
        try:
            return sorted(probs.items(), key=lambda x: x[1])[-1][0]
        except IndexError:
            return None

gptc/tokenizer.py Normal file (+11 lines)

@ -0,0 +1,11 @@
def tokenize(text):
    """Convert a string to a list of lemmas."""
    out = [""]
    for char in text.lower():
        # Letters and apostrophes extend the current token; any other
        # character ends it.
        if char.isalpha() or char == "'":
            out[-1] += char
        elif out[-1] != "":
            out.append("")
    # Drop empty strings left over from consecutive separators.
    return [string for string in out if string]
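For reference, a quick sketch of the tokenizer's behaviour (expected output
shown in the comment):

```python
import gptc.tokenizer

print(gptc.tokenizer.tokenize("All the world's a stage!"))
# ['all', 'the', "world's", 'a', 'stage']
```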