Bugfix release
This commit is contained in:
parent
1ca354c7f5
commit
11b3785f84
2
MANIFEST
2
MANIFEST
|
@ -1,3 +1,5 @@
|
||||||
# file GENERATED by distutils, do NOT edit
|
# file GENERATED by distutils, do NOT edit
|
||||||
|
README
|
||||||
setup.py
|
setup.py
|
||||||
gptc/__init__.py
|
gptc/__init__.py
|
||||||
|
gptc/__main__.py
|
||||||
|
|
76
build/lib/gptc/__init__.py
Normal file
76
build/lib/gptc/__init__.py
Normal file
|
@ -0,0 +1,76 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import sys
|
||||||
|
import spacy
|
||||||
|
|
||||||
|
nlp = spacy.load('en_core_web_sm')
|
||||||
|
|
||||||
|
def listify(text):
    """Return the lowercased lemmas of *text* that start with an ASCII letter.

    The text is run through the module-level spaCy pipeline ``nlp``; tokens
    whose lemma begins with anything other than ``a-z``/``A-Z`` (digits,
    punctuation, whitespace, ...) are dropped.
    """
    letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    words = []
    for token in nlp(text):
        lemma = token.lemma_
        # Only the first character decides whether the token is kept.
        if lemma[0] in letters:
            words.append(lemma.lower())
    return words
|
||||||
|
|
||||||
|
|
||||||
|
def compile(raw_model):
    """Compile a raw model into the dict form consumed by ``Classifier``.

    ``raw_model`` is a sequence of mappings, each with a ``'text'`` entry
    (training text) and a ``'category'`` entry (its label). It is iterated
    twice (once to build the word lists, once to score candidates), so it
    must be re-iterable, e.g. a list.

    Returns a dict with two keys:
      * ``'text'``: maps each category to the sorted list of words (as
        produced by ``listify``) seen in that category's training texts;
      * ``'stopword'``: the stop-word weight, chosen from 0.0 to 2.0 in steps
        of 0.1, that classifies the most training texts correctly (the
        smallest such weight wins ties).
    """
    model = {}
    for portion in raw_model:
        words = listify(portion['text'])
        category = portion['category']
        # A category only appears in the model once it has at least one word.
        if words:
            model.setdefault(category, []).extend(words)

    # Sort each word list once, instead of after every appended word.
    for words in model.values():
        words.sort()

    # Every candidate shares the same word lists; only the stop-word weight
    # differs between them.
    all_models = [{'text': model, 'stopword': i / 10} for i in range(0, 21)]
    for test_model in all_models:
        classifier = Classifier(test_model)
        test_model['correct'] = sum(
            1
            for sample in raw_model
            if classifier.check(sample['text']) == sample['category']
        )
        print('tested a model')

    # max() returns the first maximal item, i.e. the lowest stop-word weight
    # among the best-scoring candidates.
    best = max(all_models, key=lambda candidate: candidate['correct'])
    # The score is bookkeeping only and is not part of the compiled model.
    del best['correct']
    return best
|
||||||
|
|
||||||
|
|
||||||
|
class Classifier:
    """Assign a category to a piece of text using a GPTC model.

    Attributes:
        model: the compiled model, a dict with a ``'text'`` entry mapping
            each category to its list of words and, optionally, a
            ``'stopword'`` entry holding the weight applied to stop words.
        warn: the ``supress_uncompiled_model_warning`` flag passed to
            ``__init__``.
    """

    def __init__(self, model, supress_uncompiled_model_warning=False):
        """Store *model*, compiling it first if it is a raw model.

        A model counts as compiled when it is a dict whose ``'text'`` entry
        is itself a dict. Anything else (typically the raw list of
        ``{'text': ..., 'category': ...}`` entries) is handed to
        ``compile``, and a warning is printed to stderr unless
        ``supress_uncompiled_model_warning`` is true.
        """
        # Check the container type before indexing: a raw model is a list,
        # and indexing it with 'text' would raise TypeError.
        if isinstance(model, dict) and isinstance(model.get('text'), dict):
            self.model = model
        else:
            self.model = compile(model)
            if not supress_uncompiled_model_warning:
                print('WARNING: model was not compiled', file=sys.stderr)
                print('In development, this is OK, but precompiling the model is preferred for production use.', file=sys.stderr)
        self.warn = supress_uncompiled_model_warning

    def check(self, text):
        """Return the best-matching category for *text*, or ``'unknown'``.

        Every word of *text* (after ``listify``) adds, for each occurrence of
        that word in a category's word list, ``weight / len(word list)`` to
        that category's score. ``weight`` is 1 for ordinary words and the
        model's ``'stopword'`` value (0.5 when the model has none) for spaCy
        English stop words. The category with the highest strictly positive
        score wins; if no category scores above zero, ``'unknown'`` is
        returned.
        """
        stopword_value = self.model.get('stopword', 0.5)
        stopwords = spacy.lang.en.stop_words.STOP_WORDS
        categories = self.model['text']

        probs = {}
        for word in listify(text):
            scale = stopword_value if word in stopwords else 1
            for category, catwords in categories.items():
                if not catwords:
                    # Nothing to match, and avoids dividing by zero below.
                    continue
                weight = scale / len(catwords)
                for catword in catwords:
                    if catword == word:
                        probs[category] = probs.get(category, 0) + weight

        most_likely = ('unknown', 0)
        for category, score in probs.items():
            if score > most_likely[1]:
                most_likely = (category, score)
        return most_likely[0]
|
24
build/lib/gptc/__main__.py
Normal file
24
build/lib/gptc/__main__.py
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
#!/usr/bin/env python3
"""Command-line entry point: compile a raw model, or classify text with one."""
import argparse
import json
import sys  # needed for sys.stdin below

parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
parser.add_argument('model', help='model to use')
parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
args = parser.parse_args()

import gptc # PEP 8 violation, but don't fix it
# Way better for performance of argparse checking

with open(args.model, 'r') as f:
    raw_model = json.load(f)

if args.compile:
    # Write the compiled model as JSON to the requested output file.
    with open(args.compile, 'w+') as f:
        json.dump(gptc.compile(raw_model), f)
else:
    classifier = gptc.Classifier(raw_model)
    # Prompt when run interactively; otherwise classify whatever is piped in.
    if sys.stdin.isatty():
        text = input('Text to analyse: ')
    else:
        text = sys.stdin.read()
    print(classifier.check(text))
|
BIN
dist/gptc-0.0.1.tar.gz
vendored
Normal file
BIN
dist/gptc-0.0.1.tar.gz
vendored
Normal file
Binary file not shown.
|
@ -1,8 +1,6 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import sys
|
import sys
|
||||||
import json
|
|
||||||
import spacy
|
import spacy
|
||||||
import argparse
|
|
||||||
|
|
||||||
nlp = spacy.load('en_core_web_sm')
|
nlp = spacy.load('en_core_web_sm')
|
||||||
|
|
||||||
|
@ -76,22 +74,3 @@ class Classifier:
|
||||||
if probs[category] > most_likely[1]:
|
if probs[category] > most_likely[1]:
|
||||||
most_likely = [category, probs[category]]
|
most_likely = [category, probs[category]]
|
||||||
return most_likely[0]
|
return most_likely[0]
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
|
|
||||||
parser.add_argument('model', help='model to use')
|
|
||||||
parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
|
|
||||||
args = parser.parse_args()
|
|
||||||
with open(args.model, 'r') as f:
|
|
||||||
raw_model = json.load(f)
|
|
||||||
if args.compile:
|
|
||||||
with open(args.compile, 'w+') as f:
|
|
||||||
json.dump(compile(raw_model), f)
|
|
||||||
else:
|
|
||||||
classifier = Classifier(raw_model)
|
|
||||||
if sys.stdin.isatty():
|
|
||||||
text = input('Text to analyse: ')
|
|
||||||
else:
|
|
||||||
text = sys.stdin.read()
|
|
||||||
print(classifier.check(text))
|
|
||||||
|
|
24
gptc/__main__.py
Normal file
24
gptc/__main__.py
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
#!/usr/bin/env python3
"""Command-line entry point: compile a raw model, or classify text with one."""
import argparse
import json
import sys  # needed for sys.stdin below

parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
parser.add_argument('model', help='model to use')
parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
args = parser.parse_args()

import gptc # PEP 8 violation, but don't fix it
# Way better for performance of argparse checking

with open(args.model, 'r') as f:
    raw_model = json.load(f)

if args.compile:
    # Write the compiled model as JSON to the requested output file.
    with open(args.compile, 'w+') as f:
        json.dump(gptc.compile(raw_model), f)
else:
    classifier = gptc.Classifier(raw_model)
    # Prompt when run interactively; otherwise classify whatever is piped in.
    if sys.stdin.isatty():
        text = input('Text to analyse: ')
    else:
        text = sys.stdin.read()
    print(classifier.check(text))
|
2
setup.py
2
setup.py
|
@ -2,7 +2,7 @@ from distutils.core import setup
|
||||||
setup(
|
setup(
|
||||||
name = 'gptc', # How you named your package folder (MyLib)
|
name = 'gptc', # How you named your package folder (MyLib)
|
||||||
packages = ['gptc'], # Choose the same as "name"
|
packages = ['gptc'], # Choose the same as "name"
|
||||||
version = '0.0.0', # Start with a small number and increase it with every change you make
|
version = '0.0.1', # Start with a small number and increase it with every change you make
|
||||||
license='MIT', # Choose a license from here: https://help.github.com/articles/licensing-a-repository
|
license='MIT', # Choose a license from here: https://help.github.com/articles/licensing-a-repository
|
||||||
description = 'General-purpose English text classifier', # Give a short description about your library
|
description = 'General-purpose English text classifier', # Give a short description about your library
|
||||||
author = 'ScoopGracie', # Type in your name
|
author = 'ScoopGracie', # Type in your name
|
||||||
|
|
Loading…
Reference in New Issue
Block a user