Setup

2020-03-16 10:57:15 -07:00 · 2020-03-16 10:57:15 -07:00 · dae17ebcf6
commit dae17ebcf6
9 changed files with 1214 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+__pycache__
+*.swp
--- a/3
+++ b/3
@ -0,0 +1,3 @@
+# file GENERATED by distutils, do NOT edit
+setup.py
+gptc/__init__.py
--- a/README.md
+++ b/README.md
@ -0,0 +1,58 @@
+# GPTC
+General-purpose text classifier in Python
+
+## CLI Tool
+If you just want to do some simple classification on the command line, use the
+CLI tool. To use an existing model, <!-- When initialising a Classifier
+object, pass in the keyword argument `supress_uncompiled_model_warning=True`.
+-->use `gptc <modelfile>`. It will prompt for a string, and classify it,
+outputting the category on stdout (or "unknown" if it cannot determine
+anything). See "Model format" for a description of the model. To compile a
+model, use `gptc <rawmodelfile> -c|--compile <compiledmodelfile>`.
+
+## Library
+If you want to use GPTC programmatically, use the library.
+### `gptc.Classifier(model)`
+Create a `Classifier` object using the given model (as a Python list/dict, not
+as JSON). If the model is raw (a list), it will print a big warning on stderr.
+### `Classifier.check(text)`
+Classify `text` with GPTC using the model used to instantiate the
+`Classifier`. Returns the category into which the text is placed (as a
+string), or `'unknown'` when it cannot classify the text.
+
+## Model format
+Since you never really need to mess with compiled models, I won't discuss
+them. You can read the code if you really need to figure them out.
+
+This section explains the raw model format, which is how you should create and
+edit models.
+
+Raw models are formatted as a list of dicts. See below for the format:
+
+    [
+        {
+            "text": "<text in the category>",
+            "category": "<the category>"
+        }
+    ]
+
+Although GPTC handles models as Python lists (for raw models) or dicts (for
+compiled models), I recommend storing them in JSON format, mainly because the
+command-line tool uses JSON.
+
+You can use a raw model anywhere you can use a compiled model. However, both
+the library and the CLI tool will print a big warning to stderr if you do
+this. There is a comment in a random place in this document explaining how to
+disable this in the library. (It's in a comment so you can't do it without
+some effort.) The warning cannot be disabled in the CLI program without hacking
+the source.
+
+## Example models
+I provide an example model trained to distinguish between texts written by
+Mark Twain and those written by William Shakespeare. I chose them because
+their works have all gone into the public domain, and their writing style is
+so different that GPTC can easily tell the difference, making it a good
+demonstration.
+
+The raw model is in `twain_shakespeare_raw.json`; the compiled model is in
+`twain_shakespeare.json`.
--- a/dist/gptc-0.0.0.tar.gz
+++ b/dist/gptc-0.0.0.tar.gz
--- a/gptc/__init__.py
+++ b/gptc/__init__.py
@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+import sys
+import json
+import spacy
+import argparse
+
+# Load the spaCy 'en_core_web_sm' pipeline once, at import time; listify()
+# calls it to split text into tokens and read each token's lemma.
+nlp = spacy.load('en_core_web_sm')
+
+def listify(text):
+    return [string.lemma_.lower() for string in nlp(text) if string.lemma_[0] in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ']
+
+
+def compile(raw_model):
+    model = {}
+
+    for portion in raw_model:
+        text = listify(portion['text'])
+        category = portion['category']
+        for word in text:
+            try:
+                model[category].append(word)
+            except:
+                model[category] = [word]
+            model[category].sort()
+    all_models = [ { 'text': model, 'stopword': i/10} for i in range(0, 21) ]
+    for test_model in all_models:
+        correct = 0
+        classifier = Classifier(test_model)
+        for text in raw_model:
+            if classifier.check(text['text']) == text['category']:
+                correct += 1
+        test_model['correct'] = correct
+        print('tested a model')
+    best = all_models[0]
+    for test_model in all_models:
+        if test_model['correct'] > best['correct']:
+            best = test_model
+    del best['correct']
+    return best
+    return {'text': model}
+
+
+class Classifier:
+    def __init__(self, model, supress_uncompiled_model_warning=False):
+        if type(model['text']) == dict:
+            self.model = model
+        else:
+            self.model = compile(model)
+            if not supress_uncompiled_model_warning:
+                print('WARNING: model was not compiled', file=sys.stderr)
+                print('In development, this is OK, but precompiling the model is preferred for production use.', file=sys.stderr)
+        self.warn = supress_uncompiled_model_warning
+
+    def check(self, text):
+        model = self.model
+        stopword_value = 0.5
+        try:
+            stopword_value = model['stopword']
+        except:
+            pass
+        stopwords = spacy.lang.en.stop_words.STOP_WORDS
+        model = model['text']
+        text = listify(text)
+        probs = {}
+        for word in text:
+            for category in model.keys():
+                for catword in model[category]:
+                    if word == catword:
+                        weight = ( stopword_value if word in stopwords else 1 ) / len(model[category])
+                        try:
+                            probs[category] += weight 
+                        except:
+                            probs[category] = weight
+        most_likely = ['unknown', 0]
+        for category in probs.keys():
+            if probs[category] > most_likely[1]:
+                most_likely = [category, probs[category]]
+        return most_likely[0]
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
+    parser.add_argument('model', help='model to use')
+    parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
+    args = parser.parse_args()
+    with open(args.model, 'r') as f:
+        raw_model = json.load(f)
+    if args.compile:
+        with open(args.compile, 'w+') as f:
+            json.dump(compile(raw_model), f)
+    else:
+        classifier = Classifier(raw_model)
+        if sys.stdin.isatty():
+            text = input('Text to analyse: ')
+        else:
+            text = sys.stdin.read()
+        print(classifier.check(text))
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1 @@
+spacy
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,26 @@
+from distutils.core import setup
+# Package metadata for the gptc distribution.
+setup(
+  name = 'gptc',         # Distribution name
+  packages = ['gptc'],   # Package directories to include (same as the name)
+  version = '0.0.0',      # Release version
+  license='MIT',        # License name; matches the license classifier below
+  description = 'General-purpose English text classifier',   # One-line summary
+  author = 'ScoopGracie',                   # Author name
+  author_email = 'scoopgracie@scoopgracie.com',      # Author contact address
+  url = 'https://github.com/scoopgracie/gptc',   # Project home page
+  keywords = ['nlp', 'text', 'classification'],   # Search keywords
+  # NOTE(review): install_requires is a setuptools option; confirm that
+  # distutils.core.setup honours it, or whether setuptools was intended.
+  install_requires=[            # Runtime dependencies
+          'spacy',
+      ],
+  classifiers=[
+    'Development Status :: 4 - Beta',      # Maturity of the package
+    'Intended Audience :: Developers',      # Target audience
+    'License :: OSI Approved :: MIT License',   # License classifier
+    'Programming Language :: Python :: 3',      # Python versions listed as supported
+    'Programming Language :: Python :: 3.5',
+    'Programming Language :: Python :: 3.6',
+    'Programming Language :: Python :: 3.7',
+    'Programming Language :: Python :: 3.8',
+    'Programming Language :: Python :: 3.9',
+  ],
+)
--- a/twain_shakespeare.json
+++ b/twain_shakespeare.json
--- a/twain_shakespeare_raw.json
+++ b/twain_shakespeare_raw.json