Setup
This commit is contained in:
commit
dae17ebcf6
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
__pycache__
|
||||||
|
*.swp
|
3
MANIFEST
Normal file
3
MANIFEST
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
# file GENERATED by distutils, do NOT edit
|
||||||
|
setup.py
|
||||||
|
gptc/__init__.py
|
58
README.md
Normal file
58
README.md
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
# GPTC
|
||||||
|
General-purpose text classifier in Python
|
||||||
|
|
||||||
|
## CLI Tool
|
||||||
|
If you just want to do some simple classification on the command line, use the
|
||||||
|
CLI tool. To use an existing model, <!-- When initialising a Classifier
|
||||||
|
object, pass in the keyword argument `supress_uncompiled_model_warning=True`.
|
||||||
|
-->use `gptc <modelfile>`. It will prompt for a string, and classify it,
|
||||||
|
outputting the category on stdout (or "unknown" if it cannot determine
|
||||||
|
anything). See "Model format" for a description of the model. To compile a
|
||||||
|
model, use `gptc <rawmodelfile> -c|--compile <compiledmodelfile>`.
|
||||||
|
|
||||||
|
## Library
|
||||||
|
If you want to use GPTC programmatically, use the library.
|
||||||
|
### `gptc.Classifier(model)`
|
||||||
|
Create a `Classifier` object using the given model (as a Python list/dict, not
|
||||||
|
as JSON). If the model is raw (a list), it will print a big warning on stderr.
|
||||||
|
### `Classifier.check(text)`
|
||||||
|
Classify `text` with GPTC using the model used to instantiate the
|
||||||
|
`Classifier`. Returns the category into which the text is placed (as a
|
||||||
|
string), or `'unknown'` when it cannot classify the text.
|
||||||
|
|
||||||
|
## Model format
|
||||||
|
Since you never really need to mess with compiled models, I won't discuss
|
||||||
|
them. You can read the code if you really need to figure them out.
|
||||||
|
|
||||||
|
This section explains the raw model format, which is how you should create and
|
||||||
|
edit models.
|
||||||
|
|
||||||
|
Raw models are formatted as a list of dicts. See below for the format:
|
||||||
|
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"text": "<text in the category>",
|
||||||
|
"category": "<the category>"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
Although GPTC handles models as Python lists (for raw models) or dicts (for
|
||||||
|
compiled models), I recommend storing them in JSON format, mainly because the
|
||||||
|
command-line tool uses JSON.
|
||||||
|
|
||||||
|
You can use a raw model anywhere you can use a compiled model. However, both
|
||||||
|
the library and the CLI tool will print a big warning to stderr if you do
|
||||||
|
this. There is a comment in a random place in this document explaining how to
|
||||||
|
disable this in the library. (It's in a comment so you can't do it without
|
||||||
|
some effort. The warning cannot be disabled in the CLI program without hacking
|
||||||
|
the source.)
|
||||||
|
|
||||||
|
## Example models
|
||||||
|
I provide an example model trained to distinguish between texts written by
|
||||||
|
Mark Twain and those written by William Shakespeare. I chose them because
|
||||||
|
their works have all gone into the public domain, and their writing style is
|
||||||
|
so different that GPTC can easily tell the difference, making it a good
|
||||||
|
demonstration.
|
||||||
|
|
||||||
|
The raw model is in `twain_shakespeare_raw.json`; the compiled model is in
|
||||||
|
`twain_shakespeare.json`.
|
BIN
dist/gptc-0.0.0.tar.gz
vendored
Normal file
BIN
dist/gptc-0.0.0.tar.gz
vendored
Normal file
Binary file not shown.
97
gptc/__init__.py
Executable file
97
gptc/__init__.py
Executable file
|
@ -0,0 +1,97 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import spacy
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
nlp = spacy.load('en_core_web_sm')
|
||||||
|
|
||||||
|
def listify(text):
    """Return the lower-cased lemmas of *text*, keeping only tokens whose
    lemma begins with an ASCII letter (drops punctuation, numbers, etc.)."""
    letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    lemmas = []
    for token in nlp(text):
        if token.lemma_[0] in letters:
            lemmas.append(token.lemma_.lower())
    return lemmas
|
||||||
|
|
||||||
|
|
||||||
|
def compile(raw_model):
    """Compile a raw model into a compiled model dict.

    A raw model is a list of ``{'text': ..., 'category': ...}`` dicts.  The
    compiled model has the form ``{'text': {category: [words...]},
    'stopword': w}`` where ``w`` is the stopword weight chosen by brute
    force: the weight (0.0-2.0 in steps of 0.1) that classifies the most
    training texts correctly.

    NOTE(review): prints 'tested a model' to stdout once per candidate
    weight — preserved from the original as visible progress output.
    """
    model = {}

    # Group the lemmatized words of each portion under its category.
    for portion in raw_model:
        text = listify(portion['text'])
        category = portion['category']
        for word in text:
            # EAFP: first word for a category creates its list.
            try:
                model[category].append(word)
            except KeyError:  # narrowed from a bare except
                model[category] = [word]
        model[category].sort()

    # Brute-force search over stopword weights 0.0, 0.1, ..., 2.0, scoring
    # each candidate by how many of the training texts it classifies
    # correctly.  All candidates share the same 'text' mapping (read-only).
    all_models = [{'text': model, 'stopword': i / 10} for i in range(0, 21)]
    for test_model in all_models:
        correct = 0
        classifier = Classifier(test_model)
        for text in raw_model:
            if classifier.check(text['text']) == text['category']:
                correct += 1
        test_model['correct'] = correct
        print('tested a model')

    # Pick the highest-scoring candidate (ties go to the lower weight).
    best = all_models[0]
    for test_model in all_models:
        if test_model['correct'] > best['correct']:
            best = test_model
    del best['correct']
    return best
    # (The original had an unreachable `return {'text': model}` after the
    # return above; removed as dead code.)
|
||||||
|
|
||||||
|
|
||||||
|
class Classifier:
    """Text classifier backed by a GPTC model.

    Accepts either a compiled model (a dict whose 'text' entry maps
    categories to sorted word lists) or a raw model (a list of
    ``{'text': ..., 'category': ...}`` dicts).  Raw models are compiled on
    the fly, printing a warning to stderr unless suppressed.
    """

    def __init__(self, model, supress_uncompiled_model_warning=False):
        # BUG FIX: the original tested `type(model['text']) == dict`, which
        # raises TypeError for a raw model (a list — list indices must be
        # integers), so the documented raw-model path crashed.  Detect the
        # compiled shape safely instead.
        if isinstance(model, dict) and isinstance(model.get('text'), dict):
            self.model = model
        else:
            self.model = compile(model)
            if not supress_uncompiled_model_warning:
                print('WARNING: model was not compiled', file=sys.stderr)
                print('In development, this is OK, but precompiling the model is preferred for production use.', file=sys.stderr)
        # NOTE(review): stores the *suppress* flag, not "should warn" —
        # preserved as-is since the attribute is part of the public surface.
        self.warn = supress_uncompiled_model_warning

    def check(self, text):
        """Classify *text* with this Classifier's model.

        Returns the best-matching category as a string, or 'unknown' when
        no category scores above zero.
        """
        model = self.model
        # Compiled-by-hand models may lack a stopword weight; default 0.5.
        stopword_value = model.get('stopword', 0.5)
        stopwords = spacy.lang.en.stop_words.STOP_WORDS
        model = model['text']
        text = listify(text)
        probs = {}
        for word in text:
            for category in model.keys():
                for catword in model[category]:
                    if word == catword:
                        # Stopwords contribute a reduced (or boosted) weight;
                        # normalize by the category's total word count so
                        # larger categories don't dominate.
                        weight = (stopword_value if word in stopwords else 1) / len(model[category])
                        probs[category] = probs.get(category, 0) + weight
        # Pick the highest-scoring category; 'unknown' if nothing matched.
        most_likely = ['unknown', 0]
        for category in probs.keys():
            if probs[category] > most_likely[1]:
                most_likely = [category, probs[category]]
        return most_likely[0]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # CLI entry point: classify text with a model, or compile a raw model.
    parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
    parser.add_argument('model', help='model to use')
    parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
    args = parser.parse_args()

    # The model file is JSON: a list (raw) or a dict (compiled).
    with open(args.model, 'r') as f:
        raw_model = json.load(f)

    if args.compile:
        # Compile mode: write the compiled model as JSON to the outfile.
        with open(args.compile, 'w+') as f:
            json.dump(compile(raw_model), f)
    else:
        classifier = Classifier(raw_model)
        # Prompt interactively on a TTY; otherwise consume all of stdin.
        text = input('Text to analyse: ') if sys.stdin.isatty() else sys.stdin.read()
        print(classifier.check(text))
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
spacy
|
26
setup.py
Normal file
26
setup.py
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
from distutils.core import setup

# Packaging metadata for the gptc distribution.
setup(
    name='gptc',
    packages=['gptc'],
    version='0.0.0',
    license='MIT',
    description='General-purpose English text classifier',
    author='ScoopGracie',
    author_email='scoopgracie@scoopgracie.com',
    url='https://github.com/scoopgracie/gptc',
    keywords=['nlp', 'text', 'classification'],
    # Runtime dependency: spaCy provides lemmatization and stop words.
    install_requires=[
        'spacy',
    ],
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
    ],
)
|
1
twain_shakespeare.json
Normal file
1
twain_shakespeare.json
Normal file
File diff suppressed because one or more lines are too long
1026
twain_shakespeare_raw.json
Normal file
1026
twain_shakespeare_raw.json
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user