Reorganize code and improve README
This commit is contained in:
parent
e1267b62b0
commit
91e35a0d61
49
README.md
49
README.md
|
@ -1,29 +1,33 @@
|
|||
# GPTC
|
||||
General-purpose text classifier in Python
|
||||
|
||||
GPTC provides both a CLI tool and a Python library.
|
||||
|
||||
## CLI Tool
|
||||
If you just want to do some simple classification on the command line, use the
|
||||
CLI tool. To use an existing model, <!-- When initialising a Classifier
|
||||
object, pass in the keyword argument `supress_uncompiled_model_warning=True`.
|
||||
-->use `gptc <modelfile>`. It will prompt for a string, and classify it,
|
||||
outputting the category on stdout (or "unknown" if it cannot determine
|
||||
anything). See "Model format" for a description of the model. To compile a
|
||||
model, use `gptc <rawmodelfile> -c|--compile <compiledmodelfile>`.
|
||||
### Classifying text
|
||||
|
||||
`python -m gptc <modelfile>`
|
||||
|
||||
This will prompt for a string and classify it, outputting the category on
|
||||
stdout (or "None" if it cannot determine
|
||||
anything).
|
||||
### Compiling models
|
||||
|
||||
`gptc <raw model file> -c|--compile <compiled model file>`
|
||||
|
||||
## Library
|
||||
If you want to use GPTC programmatically, use the library.
|
||||
### `gptc.Classifier(model)`
|
||||
Create a `Classifier` object using the given model (as a Python list/dict, not
|
||||
as JSON). If the model is raw (a list), it will print a big warning on stderr.
|
||||
### `Classifier.classify(text)`
|
||||
Create a `Classifier` object using the given *compiled* model (as a dict, not
|
||||
JSON).
|
||||
#### `Classifier.classify(text)`
|
||||
Classify `text` with GPTC using the model used to instantiate the
|
||||
`Classifier`. Returns the category into which the text is placed (as a
|
||||
string), or `None` when it cannot classify the text.
|
||||
### `gptc.compile(raw_model)`
|
||||
Compile a raw model (as a list, not JSON) and return the compiled model (as a
|
||||
dict).
|
||||
|
||||
## Model format
|
||||
Since you never really need to mess with compiled models, I won't discuss
|
||||
them. You can read the code if you really need to figure them out.
|
||||
|
||||
This section explains the raw model format, which is how you should create and
|
||||
edit models.
|
||||
|
||||
|
@ -36,16 +40,13 @@ Raw models are formatted as a list of dicts. See below for the format:
|
|||
}
|
||||
]
|
||||
|
||||
Although GPTC handles models as Python lists (for raw models) or dicts (for
|
||||
compiled models), I recommend storing them in JSON format, mainly because the
|
||||
command-line tool uses JSON.
|
||||
|
||||
## Example models
|
||||
I provide an example model trained to distinguish between texts written by
|
||||
Mark Twain and those written by William Shakespeare. I chose them because
|
||||
their works have all gone into the public domain, and their writing style is
|
||||
so different that GPTC can easily tell the difference, making it a good
|
||||
demonstration.
|
||||
GPTC handles models as Python `list`s of `dict`s of `str`s (for raw models) or
|
||||
`dict`s of `str`s and `float`s (for compiled models), and they can be stored
|
||||
in any way these Python objects can be. However, it is recommended to store
|
||||
them in JSON format for compatibility with the command-line tool.
|
||||
|
||||
## Example model
|
||||
An example model, which is designed to distinguish between texts written by
|
||||
Mark Twain and those written by William Shakespeare, is available in `models`.
|
||||
The raw model is in `models/raw.json`; the compiled model is in
|
||||
`models/compiled.json`.
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
"""General-Purpose Text Classifier"""
|
||||
|
||||
from gptc.gptc import compile, Classifier
|
||||
from gptc.compiler import compile
|
||||
from gptc.classifier import Classifier
|
||||
|
|
51
gptc/classifier.py
Executable file
51
gptc/classifier.py
Executable file
|
@ -0,0 +1,51 @@
|
|||
import gptc.tokenizer
|
||||
|
||||
class Classifier:
    """A text classifier.

    Parameters
    ----------
    model : dict
        A compiled GPTC model.

    Attributes
    ----------
    model : dict
        The model used.

    """

    def __init__(self, model):
        self.model = model

    def classify(self, text):
        """Classify text.

        The text is tokenized, the per-category values the model holds
        for each known word are summed, and the category with the
        largest total is returned. Words missing from the model are
        ignored.

        Parameters
        ----------
        text : str
            The text to classify

        Returns
        -------
        str or None
            The most likely category, or None if no guess was made
            (no word of the text contributed any score). When several
            categories share the largest total, the one that entered
            the running totals last is returned.

        """

        model = self.model
        scores = {}

        for word in gptc.tokenizer.tokenize(text):
            # Keep the try body to the single lookup that may miss, so a
            # KeyError raised anywhere else is not silently swallowed.
            try:
                weights = model[word]
            except KeyError:
                continue

            for category, value in weights.items():
                if category in scores:
                    scores[category] += value
                else:
                    scores[category] = value

        # Linear scan for the maximum instead of sorting every total.
        # ``>=`` lets a later category replace an equal earlier one.
        best_category = None
        best_score = None
        for category, score in scores.items():
            if best_score is None or score >= best_score:
                best_category = category
                best_score = score

        return best_category
|
45
gptc/compiler.py
Executable file
45
gptc/compiler.py
Executable file
|
@ -0,0 +1,45 @@
|
|||
import gptc.tokenizer
|
||||
|
||||
def compile(raw_model):
    """Compile a raw model.

    Parameters
    ----------
    raw_model : list of dict
        A raw GPTC model. Each entry is read through its ``'text'`` and
        ``'category'`` keys.

    Returns
    -------
    dict
        A compiled GPTC model. It maps each word to a dict of
        ``{category: weight}``, where the weight is the number of times
        the word occurs in that category's text divided by the total
        number of tokens in that category.

    """

    # Pool the tokens of every portion under its category.
    tokens_by_category = {}
    for portion in raw_model:
        tokens = gptc.tokenizer.tokenize(portion['text'])
        category = portion['category']
        tokens_by_category.setdefault(category, []).extend(tokens)

    word_weights = {}
    for category, tokens in tokens_by_category.items():
        total = len(tokens)

        # Count occurrences first; dicts keep first-seen word order.
        counts = {}
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1

        for word, count in counts.items():
            # A single division per word avoids the rounding drift of
            # adding 1/total once per occurrence.
            word_weights.setdefault(word, {})[category] = count / total

    return word_weights
|
110
gptc/gptc.py
110
gptc/gptc.py
|
@ -1,110 +0,0 @@
|
|||
'''Main module for GPTC.'''
|
||||
|
||||
import sys
|
||||
|
||||
def _listify(text):
    """Split a string into a list of lowercase word tokens.

    The text is lowercased, then every maximal run of alphabetic
    characters and apostrophes becomes one token. Any other character
    acts as a separator and is dropped. No stemming or lemmatization is
    performed.

    Parameters
    ----------
    text : str
        The text to split.

    Returns
    -------
    list of str
        The tokens in order of appearance; empty if the text contains
        no alphabetic characters or apostrophes.

    """
    tokens = []
    current = []  # characters of the token being built

    for char in text.lower():
        if char.isalpha() or char == "'":
            current.append(char)
        elif current:
            tokens.append("".join(current))
            current = []

    # Flush a token that runs up to the end of the text.
    if current:
        tokens.append("".join(current))

    return tokens
|
||||
|
||||
def compile(raw_model):
    """Compile a raw model.

    Parameters
    ----------
    raw_model : list of dict
        A raw GPTC model. Each entry is read through its ``'text'`` and
        ``'category'`` keys.

    Returns
    -------
    dict
        A compiled GPTC model. It maps each word to a dict of
        ``{category: weight}``, where the weight is the number of times
        the word occurs in that category's text divided by the total
        number of tokens in that category.

    """

    # Pool the tokens of every portion under its category.
    tokens_by_category = {}
    for portion in raw_model:
        tokens = _listify(portion['text'])
        category = portion['category']
        tokens_by_category.setdefault(category, []).extend(tokens)

    word_weights = {}
    for category, tokens in tokens_by_category.items():
        total = len(tokens)

        # Count occurrences first; dicts keep first-seen word order.
        counts = {}
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1

        for word, count in counts.items():
            # A single division per word avoids the rounding drift of
            # adding 1/total once per occurrence.
            word_weights.setdefault(word, {})[category] = count / total

    return word_weights
|
||||
|
||||
|
||||
class Classifier:
    """A text classifier.

    Parameters
    ----------
    model : dict
        A compiled GPTC model.

    Attributes
    ----------
    model : dict
        The model used.

    """

    def __init__(self, model):
        self.model = model

    def classify(self, text):
        """Classify text.

        The text is split into words, the per-category values the model
        holds for each known word are summed, and the category with the
        largest total is returned. Words missing from the model are
        ignored.

        Parameters
        ----------
        text : str
            The text to classify

        Returns
        -------
        str or None
            The most likely category, or None if no guess was made
            (no word of the text contributed any score). When several
            categories share the largest total, the one that entered
            the running totals last is returned.

        """

        model = self.model
        scores = {}

        for word in _listify(text):
            # Keep the try body to the single lookup that may miss, so a
            # KeyError raised anywhere else is not silently swallowed.
            try:
                weights = model[word]
            except KeyError:
                continue

            for category, value in weights.items():
                if category in scores:
                    scores[category] += value
                else:
                    scores[category] = value

        # Linear scan for the maximum instead of sorting every total.
        # ``>=`` lets a later category replace an equal earlier one.
        best_category = None
        best_score = None
        for category, score in scores.items():
            if best_score is None or score >= best_score:
                best_category = category
                best_score = score

        return best_category
|
11
gptc/tokenizer.py
Normal file
11
gptc/tokenizer.py
Normal file
|
@ -0,0 +1,11 @@
|
|||
def tokenize(text):
    """Split a string into a list of lowercase word tokens.

    The text is lowercased, then every maximal run of alphabetic
    characters and apostrophes becomes one token. Any other character
    acts as a separator and is dropped. No stemming or lemmatization is
    performed.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance; empty if the text contains
        no alphabetic characters or apostrophes.

    """
    tokens = []
    current = []  # characters of the token being built

    for char in text.lower():
        if char.isalpha() or char == "'":
            current.append(char)
        elif current:
            tokens.append("".join(current))
            current = []

    # Flush a token that runs up to the end of the text.
    if current:
        tokens.append("".join(current))

    return tokens
|
Loading…
Reference in New Issue
Block a user