Setup
This commit is contained in:
commit
dae17ebcf6
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
__pycache__
|
||||
*.swp
|
3
MANIFEST
Normal file
3
MANIFEST
Normal file
|
@ -0,0 +1,3 @@
|
|||
# file GENERATED by distutils, do NOT edit
|
||||
setup.py
|
||||
gptc/__init__.py
|
58
README.md
Normal file
58
README.md
Normal file
|
@ -0,0 +1,58 @@
|
|||
# GPTC
|
||||
General-purpose text classifier in Python
|
||||
|
||||
## CLI Tool
|
||||
If you just want to do some simple classification on the command line, use the
|
||||
CLI tool. To use an existing model, <!-- When initialising a Classifier
|
||||
object, pass in the keyword argument `supress_uncompiled_model_warning=True`.
|
||||
-->use `gptc <modelfile>`. It will prompt for a string, and classify it,
|
||||
outputting the category on stdout (or "unknown" if it cannot determine
|
||||
anything). See "Model format" for a description of the model. To compile a
|
||||
model, use `gptc <rawmodelfile> -c|--compile <compiledmodelfile>`.
|
||||
|
||||
## Library
|
||||
If you want to use GPTC programmatically, use the library.
|
||||
### `gptc.Classifier(model)`
|
||||
Create a `Classifier` object using the given model (as a Python list/dict, not
|
||||
as JSON). If the model is raw (a list), it will print a big warning on stderr.
|
||||
### `Classifier.check(text)`
|
||||
Classify `text` with GPTC using the model used to instantiate the
|
||||
`Classifier`. Returns the category into which the text is placed (as a
|
||||
string), or `'unknown'` when it cannot classify the text.
|
||||
|
||||
## Model format
|
||||
Since you never really need to mess with compiled models, I won't discuss
|
||||
them. You can read the code if you really need to figure them out.
|
||||
|
||||
This section explains the raw model format, which is how you should create and
|
||||
edit models.
|
||||
|
||||
Raw models are formatted as a list of dicts. See below for the format:
|
||||
|
||||
[
|
||||
{
|
||||
"text": "<text in the category>",
|
||||
"category": "<the category>"
|
||||
}
|
||||
]
|
||||
|
||||
Although GPTC handles models as Python lists (for raw models) or dicts (for
|
||||
compiled models), I recommend storing them in JSON format, mainly because the
|
||||
command-line tool uses JSON.
|
||||
|
||||
You can use a raw model anywhere you can use a compiled model. However, both
|
||||
the library and the CLI tool will print a big warning to stderr if you do
|
||||
this. There is a comment in a random place in this document explaining how to
|
||||
disable this in the library. (It's in a comment so you can't do it without
|
||||
some effort.) The warning cannot be disabled in the CLI program without hacking
|
||||
the source.
|
||||
|
||||
## Example models
|
||||
I provide an example model trained to distinguish between texts written by
|
||||
Mark Twain and those written by William Shakespeare. I chose them because
|
||||
their works have all gone into the public domain, and their writing styles are
|
||||
so different that GPTC can easily tell the difference, making it a good
|
||||
demonstration.
|
||||
|
||||
The raw model is in `twain_shakespeare_raw.json`; the compiled model is in
|
||||
`twain_shakespeare.json`.
|
BIN
dist/gptc-0.0.0.tar.gz
vendored
Normal file
BIN
dist/gptc-0.0.0.tar.gz
vendored
Normal file
Binary file not shown.
97
gptc/__init__.py
Executable file
97
gptc/__init__.py
Executable file
|
@ -0,0 +1,97 @@
|
|||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import json
|
||||
import spacy
|
||||
import argparse
|
||||
|
||||
nlp = spacy.load('en_core_web_sm')
|
||||
|
||||
def listify(text):
    """Split ``text`` into a list of lowercased lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``.  Only
    tokens whose lemma begins with an ASCII letter are kept, which drops
    punctuation, numbers and whitespace tokens.

    Returns a list of strings in token order (duplicates are preserved).
    """
    letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    words = []
    for token in nlp(text):
        lemma = token.lemma_
        # Guard against an empty lemma: indexing lemma[0] on '' would raise
        # IndexError and abort the whole classification.
        if lemma and lemma[0] in letters:
            words.append(lemma.lower())
    return words
|
||||
|
||||
|
||||
def compile(raw_model):
    """Compile a raw model into the dict form used by ``Classifier``.

    ``raw_model`` is a list of dicts, each with a ``'text'`` and a
    ``'category'`` key.  The words of every text (as produced by
    ``listify``) are collected per category into sorted lists.

    The function then tries stopword weights 0.0, 0.1, ..., 2.0, classifies
    every training text with each weight, and keeps the weight that gets the
    most training texts right (the lowest such weight on a tie).

    Returns a dict ``{'text': {category: [word, ...]}, 'stopword': weight}``.
    """
    model = {}
    for portion in raw_model:
        words = listify(portion['text'])
        if words:
            # setdefault replaces the former bare try/except around append.
            model.setdefault(portion['category'], []).extend(words)
    # Sort each category once, after all of its words have been gathered,
    # instead of re-sorting while the lists are still being built.
    for words in model.values():
        words.sort()

    # Every candidate shares the same word lists; only the weight differs.
    candidates = [{'text': model, 'stopword': i / 10} for i in range(21)]

    best = candidates[0]
    best_correct = -1
    for candidate in candidates:
        classifier = Classifier(candidate)
        correct = 0
        for portion in raw_model:
            if classifier.check(portion['text']) == portion['category']:
                correct += 1
        # Progress goes to stderr so that stdout carries only real output
        # (the command-line tool prints the resulting category on stdout).
        print('tested a model', file=sys.stderr)
        # Strict comparison keeps the earliest candidate when scores tie.
        if correct > best_correct:
            best = candidate
            best_correct = correct
    return best
|
||||
|
||||
|
||||
class Classifier:
    """Classify text into the categories of a GPTC model.

    ``model`` is either a compiled model (a dict whose ``'text'`` entry maps
    each category to a list of words, optionally with a ``'stopword'``
    weight) or a raw model (a list of ``{'text': ..., 'category': ...}``
    dicts).  A raw model is compiled on construction, and a warning is
    printed to stderr unless ``supress_uncompiled_model_warning`` is true.
    """

    def __init__(self, model, supress_uncompiled_model_warning=False):
        # Decide by container type.  Subscripting first (model['text'])
        # raises TypeError for a raw model, because a raw model is a list.
        if isinstance(model, dict):
            self.model = model
        else:
            self.model = compile(model)
            if not supress_uncompiled_model_warning:
                print('WARNING: model was not compiled', file=sys.stderr)
                print('In development, this is OK, but precompiling the model is preferred for production use.', file=sys.stderr)
        # Stores the suppress flag exactly as passed in; the attribute name
        # is kept so existing readers of ``.warn`` keep working.
        self.warn = supress_uncompiled_model_warning

    def check(self, text):
        """Return the category that best matches ``text``.

        Each word of ``text`` (as produced by ``listify``) adds, for every
        category containing it, its number of occurrences in that category
        divided by the category's size.  Words in spaCy's English stop-word
        list are scaled by the model's ``'stopword'`` weight (0.5 when the
        model has none).

        Returns the highest-scoring category name, or ``'unknown'`` when no
        category scores above zero.
        """
        stopword_value = self.model.get('stopword', 0.5)
        stopwords = spacy.lang.en.stop_words.STOP_WORDS
        categories = self.model['text']

        probs = {}
        for word in listify(text):
            base = stopword_value if word in stopwords else 1
            for category, catwords in categories.items():
                occurrences = catwords.count(word)
                if occurrences:
                    weight = base / len(catwords)
                    probs[category] = probs.get(category, 0) + weight * occurrences

        best_category = 'unknown'
        best_score = 0
        for category, score in probs.items():
            # Strict comparison: a score of exactly zero stays 'unknown' and
            # the first category reached wins a tie.
            if score > best_score:
                best_category = category
                best_score = score
        return best_category
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: either compile a raw model to a file, or
    # classify one piece of text and print its category on stdout.
    parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
    parser.add_argument('model', help='model to use')
    parser.add_argument('-c', '--compile', help='compile raw model to outfile', metavar='outfile')
    args = parser.parse_args()

    # The file may hold either a raw or a compiled model.
    with open(args.model, 'r') as f:
        model = json.load(f)

    if args.compile:
        with open(args.compile, 'w+') as f:
            json.dump(compile(model), f)
    else:
        classifier = Classifier(model)
        # Prompt only when a person is at the terminal; otherwise classify
        # everything that was piped in on stdin.
        if sys.stdin.isatty():
            text = input('Text to analyse: ')
        else:
            text = sys.stdin.read()
        print(classifier.check(text))
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
|
@ -0,0 +1 @@
|
|||
spacy
|
26
setup.py
Normal file
26
setup.py
Normal file
|
@ -0,0 +1,26 @@
|
|||
from distutils.core import setup
|
||||
# Package metadata for the gptc distribution.
setup(
    name='gptc',
    packages=['gptc'],
    version='0.0.0',
    license='MIT',
    description='General-purpose English text classifier',
    author='ScoopGracie',
    author_email='scoopgracie@scoopgracie.com',
    url='https://github.com/scoopgracie/gptc',
    keywords=['nlp', 'text', 'classification'],
    # NOTE(review): install_requires is a setuptools keyword, while setup()
    # here comes from distutils.core -- confirm the dependency is actually
    # recorded in the built distribution.
    install_requires=[
        'spacy',
    ],
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        # Python versions the package declares support for.
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
    ],
)
|
1
twain_shakespeare.json
Normal file
1
twain_shakespeare.json
Normal file
File diff suppressed because one or more lines are too long
1026
twain_shakespeare_raw.json
Normal file
1026
twain_shakespeare_raw.json
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user