Setup
This commit is contained in:
commit
dae17ebcf6
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
__pycache__
|
||||||
|
*.swp
|
3
MANIFEST
Normal file
3
MANIFEST
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
# file GENERATED by distutils, do NOT edit
|
||||||
|
setup.py
|
||||||
|
gptc/__init__.py
|
58
README.md
Normal file
58
README.md
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
# GPTC
|
||||||
|
General-purpose text classifier in Python
|
||||||
|
|
||||||
|
## CLI Tool
|
||||||
|
If you just want to do some simple classification on the command line, use the
|
||||||
|
CLI tool. To use an existing model, <!-- When initialising a Classifier
|
||||||
|
object, pass in the keyword argument `supress_uncompiled_model_warning=True`.
|
||||||
|
-->use `gptc <modelfile>`. It will prompt for a string, and classify it,
|
||||||
|
outputting the category on stdout (or "unknown" if it cannot determine
|
||||||
|
anything). See "Model format" for a description of the model. To compile a
|
||||||
|
model, use `gptc <rawmodelfile> -c|--compile <compiledmodelfile>`.
|
||||||
|
|
||||||
|
## Library
|
||||||
|
If you want to use GPTC programmatically, use the library.
|
||||||
|
### `gptc.Classifier(model)`
|
||||||
|
Create a `Classifier` object using the given model (as a Python list/dict, not
|
||||||
|
as JSON). If the model is raw (a list), it will print a big warning on stderr.
|
||||||
|
### `Classifier.check(text)`
|
||||||
|
Classify `text` with GPTC using the model used to instantiate the
|
||||||
|
`Classifier`. Returns the category into which the text is placed (as a
|
||||||
|
string), or `'unknown'` when it cannot classify the text.
|
||||||
|
|
||||||
|
## Model format
|
||||||
|
Since you never really need to mess with compiled models, I won't discuss
|
||||||
|
them. You can read the code if you really need to figure them out.
|
||||||
|
|
||||||
|
This section explains the raw model format, which is how you should create and
|
||||||
|
edit models.
|
||||||
|
|
||||||
|
Raw models are formatted as a list of dicts. See below for the format:
|
||||||
|
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"text": "<text in the category>",
|
||||||
|
"category": "<the category>"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
Although GPTC handles models as Python lists (for raw models) or dicts (for
|
||||||
|
compiled models), I recommend storing them in JSON format, mainly because the
|
||||||
|
command-line tool uses JSON.
|
||||||
|
|
||||||
|
You can use a raw model anywhere you can use a compiled model. However, both
|
||||||
|
the library and the CLI tool will print a big warning to stderr if you do
|
||||||
|
this. There is a comment in a random place in this document explaining how to
|
||||||
|
disable this in the library. (It's in a comment so you can't do it without
|
||||||
|
some effort. The warning cannot be disabled in the CLI program without hacking
|
||||||
|
the source.)
|
||||||
|
|
||||||
|
## Example models
|
||||||
|
I provide an example model trained to distinguish between texts written by
|
||||||
|
Mark Twain and those written by William Shakespeare. I chose them because
|
||||||
|
their works have all gone into the public domain, and their writing style is
|
||||||
|
so different that GPTC can easily tell the difference, making it a good
|
||||||
|
demonstration.
|
||||||
|
|
||||||
|
The raw model is in `twain_shakespeare_raw.json`; the compiled model is in
|
||||||
|
`twain_shakespeare.json`.
|
BIN
dist/gptc-0.0.0.tar.gz
vendored
Normal file
BIN
dist/gptc-0.0.0.tar.gz
vendored
Normal file
Binary file not shown.
97
gptc/__init__.py
Executable file
97
gptc/__init__.py
Executable file
|
@ -0,0 +1,97 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import spacy
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
nlp = spacy.load('en_core_web_sm')
|
||||||
|
|
||||||
|
def listify(text):
    """Return the lower-cased lemmas of *text*, keeping only tokens whose
    lemma begins with an ASCII letter (drops punctuation, numbers, etc.)."""
    letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    lemmas = []
    for token in nlp(text):
        if token.lemma_[0] in letters:
            lemmas.append(token.lemma_.lower())
    return lemmas
|
||||||
|
|
||||||
|
|
||||||
|
def compile(raw_model):
    """Compile a raw model into a compiled model dict.

    A raw model is a list of ``{'text': ..., 'category': ...}`` dicts.  The
    compiled model has the form ``{'text': {category: [words...]},
    'stopword': w}`` where ``w`` is the stopword weight chosen by brute
    force: the weight (0.0-2.0 in steps of 0.1) that classifies the most
    training texts correctly.

    NOTE(review): prints 'tested a model' to stdout once per candidate
    weight — preserved from the original as visible progress output.
    """
    model = {}

    # Group the lemmatized words of each portion under its category.
    for portion in raw_model:
        text = listify(portion['text'])
        category = portion['category']
        for word in text:
            # EAFP: first word for a category creates its list.
            try:
                model[category].append(word)
            except KeyError:  # narrowed from a bare except
                model[category] = [word]
        model[category].sort()

    # Brute-force search over stopword weights 0.0, 0.1, ..., 2.0, scoring
    # each candidate by how many of the training texts it classifies
    # correctly.  All candidates share the same 'text' mapping (read-only).
    all_models = [{'text': model, 'stopword': i / 10} for i in range(0, 21)]
    for test_model in all_models:
        correct = 0
        classifier = Classifier(test_model)
        for text in raw_model:
            if classifier.check(text['text']) == text['category']:
                correct += 1
        test_model['correct'] = correct
        print('tested a model')

    # Pick the highest-scoring candidate (ties go to the lower weight).
    best = all_models[0]
    for test_model in all_models:
        if test_model['correct'] > best['correct']:
            best = test_model
    del best['correct']
    return best
    # (The original had an unreachable `return {'text': model}` after the
    # return above; removed as dead code.)
|
||||||
|
|
||||||
|
|
||||||
|
class Classifier:
    """Text classifier backed by a GPTC model.

    Accepts either a compiled model (a dict whose 'text' entry maps
    categories to sorted word lists) or a raw model (a list of
    ``{'text': ..., 'category': ...}`` dicts).  Raw models are compiled on
    the fly, printing a warning to stderr unless suppressed.
    """

    def __init__(self, model, supress_uncompiled_model_warning=False):
        # BUG FIX: the original tested `type(model['text']) == dict`, which
        # raises TypeError for a raw model (a list — list indices must be
        # integers), so the documented raw-model path crashed.  Detect the
        # compiled shape safely instead.
        if isinstance(model, dict) and isinstance(model.get('text'), dict):
            self.model = model
        else:
            self.model = compile(model)
            if not supress_uncompiled_model_warning:
                print('WARNING: model was not compiled', file=sys.stderr)
                print('In development, this is OK, but precompiling the model is preferred for production use.', file=sys.stderr)
        # NOTE(review): stores the *suppress* flag, not "should warn" —
        # preserved as-is since the attribute is part of the public surface.
        self.warn = supress_uncompiled_model_warning

    def check(self, text):
        """Classify *text* with this Classifier's model.

        Returns the best-matching category as a string, or 'unknown' when
        no category scores above zero.
        """
        model = self.model
        # Compiled-by-hand models may lack a stopword weight; default 0.5.
        stopword_value = model.get('stopword', 0.5)
        stopwords = spacy.lang.en.stop_words.STOP_WORDS
        model = model['text']
        text = listify(text)
        probs = {}
        for word in text:
            for category in model.keys():
                for catword in model[category]:
                    if word == catword:
                        # Stopwords contribute a reduced (or boosted) weight;
                        # normalize by the category's total word count so
                        # larger categories don't dominate.
                        weight = (stopword_value if word in stopwords else 1) / len(model[category])
                        probs[category] = probs.get(category, 0) + weight
        # Pick the highest-scoring category; 'unknown' if nothing matched.
        most_likely = ['unknown', 0]
        for category in probs.keys():
            if probs[category] > most_likely[1]:
                most_likely = [category, probs[category]]
        return most_likely[0]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # CLI entry point: classify text with a model, or compile a raw model.
    parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
    parser.add_argument('model', help='model to use')
    parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
    args = parser.parse_args()

    # The model file is JSON: a list (raw) or a dict (compiled).
    with open(args.model, 'r') as f:
        raw_model = json.load(f)

    if args.compile:
        # Compile mode: write the compiled model as JSON to the outfile.
        with open(args.compile, 'w+') as f:
            json.dump(compile(raw_model), f)
    else:
        classifier = Classifier(raw_model)
        # Prompt interactively on a TTY; otherwise consume all of stdin.
        text = input('Text to analyse: ') if sys.stdin.isatty() else sys.stdin.read()
        print(classifier.check(text))
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
|
@ -0,0 +1 @@
|
||||||
|
spacy
|
26
setup.py
Normal file
26
setup.py
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
from distutils.core import setup

# Packaging metadata for the gptc distribution.
setup(
    name='gptc',
    packages=['gptc'],
    version='0.0.0',
    license='MIT',
    description='General-purpose English text classifier',
    author='ScoopGracie',
    author_email='scoopgracie@scoopgracie.com',
    url='https://github.com/scoopgracie/gptc',
    keywords=['nlp', 'text', 'classification'],
    # Runtime dependency: spaCy provides lemmatization and stop words.
    install_requires=[
        'spacy',
    ],
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
    ],
)
|
1
twain_shakespeare.json
Normal file
1
twain_shakespeare.json
Normal file
File diff suppressed because one or more lines are too long
1026
twain_shakespeare_raw.json
Normal file
1026
twain_shakespeare_raw.json
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user