diff --git a/MANIFEST b/MANIFEST
index cf8ce4d..af34275 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -1,3 +1,5 @@
 # file GENERATED by distutils, do NOT edit
+README
 setup.py
 gptc/__init__.py
+gptc/__main__.py
diff --git a/README.md b/README
similarity index 100%
rename from README.md
rename to README
diff --git a/build/lib/gptc/__init__.py b/build/lib/gptc/__init__.py
new file mode 100644
index 0000000..061c263
--- /dev/null
+++ b/build/lib/gptc/__init__.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+import sys
+import spacy
+
+nlp = spacy.load('en_core_web_sm')
+
+def listify(text):
+    return [string.lemma_.lower() for string in nlp(text) if string.lemma_[0] in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ']
+
+
+def compile(raw_model):
+    model = {}
+
+    for portion in raw_model:
+        text = listify(portion['text'])
+        category = portion['category']
+        for word in text:
+            try:
+                model[category].append(word)
+            except:
+                model[category] = [word]
+            model[category].sort()
+    all_models = [ { 'text': model, 'stopword': i/10} for i in range(0, 21) ]
+    for test_model in all_models:
+        correct = 0
+        classifier = Classifier(test_model)
+        for text in raw_model:
+            if classifier.check(text['text']) == text['category']:
+                correct += 1
+        test_model['correct'] = correct
+        print('tested a model')
+    best = all_models[0]
+    for test_model in all_models:
+        if test_model['correct'] > best['correct']:
+            best = test_model
+    del best['correct']
+    return best
+    return {'text': model}
+
+
+class Classifier:
+    def __init__(self, model, supress_uncompiled_model_warning=False):
+        if type(model['text']) == dict:
+            self.model = model
+        else:
+            self.model = compile(model)
+            if not supress_uncompiled_model_warning:
+                print('WARNING: model was not compiled', file=sys.stderr)
+                print('In development, this is OK, but precompiling the model is preferred for production use.', file=sys.stderr)
+        self.warn = supress_uncompiled_model_warning
+
+    def check(self, text):
+        model = self.model
+        stopword_value = 0.5
+        try:
+            stopword_value = model['stopword']
+        except:
+            pass
+        stopwords = spacy.lang.en.stop_words.STOP_WORDS
+        model = model['text']
+        text = listify(text)
+        probs = {}
+        for word in text:
+            for category in model.keys():
+                for catword in model[category]:
+                    if word == catword:
+                        weight = ( stopword_value if word in stopwords else 1 ) / len(model[category])
+                        try:
+                            probs[category] += weight
+                        except:
+                            probs[category] = weight
+        most_likely = ['unknown', 0]
+        for category in probs.keys():
+            if probs[category] > most_likely[1]:
+                most_likely = [category, probs[category]]
+        return most_likely[0]
diff --git a/build/lib/gptc/__main__.py b/build/lib/gptc/__main__.py
new file mode 100644
index 0000000..ac04aa7
--- /dev/null
+++ b/build/lib/gptc/__main__.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import sys
+
+parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
+parser.add_argument('model', help='model to use')
+parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
+args = parser.parse_args()
+
+import gptc # PEP 8 violation, but don't fix it
+# Way better for performance of argparse checking
+
+with open(args.model, 'r') as f:
+    raw_model = json.load(f)
+if args.compile:
+    with open(args.compile, 'w+') as f:
+        json.dump(gptc.compile(raw_model), f)
+else:
+    classifier = gptc.Classifier(raw_model)
+    if sys.stdin.isatty():
+        text = input('Text to analyse: ')
+    else:
+        text = sys.stdin.read()
+    print(classifier.check(text))
diff --git a/dist/gptc-0.0.1.tar.gz b/dist/gptc-0.0.1.tar.gz
new file mode 100644
index 0000000..926e051
Binary files /dev/null and b/dist/gptc-0.0.1.tar.gz differ
diff --git a/gptc/__init__.py b/gptc/__init__.py
index 43859a8..061c263 100755
--- a/gptc/__init__.py
+++ b/gptc/__init__.py
@@ -1,8 +1,6 @@
 #!/usr/bin/env python3
 import sys
-import json
 import spacy
-import argparse
 
 nlp = spacy.load('en_core_web_sm')
 
@@ -76,22 +74,3 @@ class Classifier:
             if probs[category] > most_likely[1]:
                 most_likely = [category, probs[category]]
         return most_likely[0]
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
-    parser.add_argument('model', help='model to use')
-    parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
-    args = parser.parse_args()
-    with open(args.model, 'r') as f:
-        raw_model = json.load(f)
-    if args.compile:
-        with open(args.compile, 'w+') as f:
-            json.dump(compile(raw_model), f)
-    else:
-        classifier = Classifier(raw_model)
-        if sys.stdin.isatty():
-            text = input('Text to analyse: ')
-        else:
-            text = sys.stdin.read()
-        print(classifier.check(text))
diff --git a/gptc/__main__.py b/gptc/__main__.py
new file mode 100644
index 0000000..ac04aa7
--- /dev/null
+++ b/gptc/__main__.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import sys
+
+parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
+parser.add_argument('model', help='model to use')
+parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
+args = parser.parse_args()
+
+import gptc # PEP 8 violation, but don't fix it
+# Way better for performance of argparse checking
+
+with open(args.model, 'r') as f:
+    raw_model = json.load(f)
+if args.compile:
+    with open(args.compile, 'w+') as f:
+        json.dump(gptc.compile(raw_model), f)
+else:
+    classifier = gptc.Classifier(raw_model)
+    if sys.stdin.isatty():
+        text = input('Text to analyse: ')
+    else:
+        text = sys.stdin.read()
+    print(classifier.check(text))
diff --git a/setup.py b/setup.py
index 4c3c988..c05da85 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from distutils.core import setup
 setup(
   name = 'gptc', # How you named your package folder (MyLib)
   packages = ['gptc'], # Chose the same as "name"
-  version = '0.0.0', # Start with a small number and increase it with every change you make
+  version = '0.0.1', # Start with a small number and increase it with every change you make
   license='MIT', # Chose a license from here: https://help.github.com/articles/licensing-a-repository
   description = 'General-purpose English text classifier', # Give a short description about your library
   author = 'ScoopGracie', # Type in your name