scoopgracie
4 years ago
6 changed files with 138 additions and 108 deletions
@ -1,3 +1,138 @@ |
|||||||
__pycache__ |
# Byte-compiled / optimized / DLL files |
||||||
*.swp |
__pycache__/ |
||||||
venv |
*.py[cod] |
||||||
|
*$py.class |
||||||
|
|
||||||
|
# C extensions |
||||||
|
*.so |
||||||
|
|
||||||
|
# Distribution / packaging |
||||||
|
.Python |
||||||
|
build/ |
||||||
|
develop-eggs/ |
||||||
|
dist/ |
||||||
|
downloads/ |
||||||
|
eggs/ |
||||||
|
.eggs/ |
||||||
|
lib/ |
||||||
|
lib64/ |
||||||
|
parts/ |
||||||
|
sdist/ |
||||||
|
var/ |
||||||
|
wheels/ |
||||||
|
share/python-wheels/ |
||||||
|
*.egg-info/ |
||||||
|
.installed.cfg |
||||||
|
*.egg |
||||||
|
MANIFEST |
||||||
|
|
||||||
|
# PyInstaller |
||||||
|
# Usually these files are written by a python script from a template |
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it. |
||||||
|
*.manifest |
||||||
|
*.spec |
||||||
|
|
||||||
|
# Installer logs |
||||||
|
pip-log.txt |
||||||
|
pip-delete-this-directory.txt |
||||||
|
|
||||||
|
# Unit test / coverage reports |
||||||
|
htmlcov/ |
||||||
|
.tox/ |
||||||
|
.nox/ |
||||||
|
.coverage |
||||||
|
.coverage.* |
||||||
|
.cache |
||||||
|
nosetests.xml |
||||||
|
coverage.xml |
||||||
|
*.cover |
||||||
|
*.py,cover |
||||||
|
.hypothesis/ |
||||||
|
.pytest_cache/ |
||||||
|
cover/ |
||||||
|
|
||||||
|
# Translations |
||||||
|
*.mo |
||||||
|
*.pot |
||||||
|
|
||||||
|
# Django stuff: |
||||||
|
*.log |
||||||
|
local_settings.py |
||||||
|
db.sqlite3 |
||||||
|
db.sqlite3-journal |
||||||
|
|
||||||
|
# Flask stuff: |
||||||
|
instance/ |
||||||
|
.webassets-cache |
||||||
|
|
||||||
|
# Scrapy stuff: |
||||||
|
.scrapy |
||||||
|
|
||||||
|
# Sphinx documentation |
||||||
|
docs/_build/ |
||||||
|
|
||||||
|
# PyBuilder |
||||||
|
.pybuilder/ |
||||||
|
target/ |
||||||
|
|
||||||
|
# Jupyter Notebook |
||||||
|
.ipynb_checkpoints |
||||||
|
|
||||||
|
# IPython |
||||||
|
profile_default/ |
||||||
|
ipython_config.py |
||||||
|
|
||||||
|
# pyenv |
||||||
|
# For a library or package, you might want to ignore these files since the code is |
||||||
|
# intended to run in multiple environments; otherwise, check them in: |
||||||
|
# .python-version |
||||||
|
|
||||||
|
# pipenv |
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
||||||
|
# install all needed dependencies. |
||||||
|
#Pipfile.lock |
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow |
||||||
|
__pypackages__/ |
||||||
|
|
||||||
|
# Celery stuff |
||||||
|
celerybeat-schedule |
||||||
|
celerybeat.pid |
||||||
|
|
||||||
|
# SageMath parsed files |
||||||
|
*.sage.py |
||||||
|
|
||||||
|
# Environments |
||||||
|
.env |
||||||
|
.venv |
||||||
|
env/ |
||||||
|
venv/ |
||||||
|
ENV/ |
||||||
|
env.bak/ |
||||||
|
venv.bak/ |
||||||
|
|
||||||
|
# Spyder project settings |
||||||
|
.spyderproject |
||||||
|
.spyproject |
||||||
|
|
||||||
|
# Rope project settings |
||||||
|
.ropeproject |
||||||
|
|
||||||
|
# mkdocs documentation |
||||||
|
/site |
||||||
|
|
||||||
|
# mypy |
||||||
|
.mypy_cache/ |
||||||
|
.dmypy.json |
||||||
|
dmypy.json |
||||||
|
|
||||||
|
# Pyre type checker |
||||||
|
.pyre/ |
||||||
|
|
||||||
|
# pytype static type analyzer |
||||||
|
.pytype/ |
||||||
|
|
||||||
|
# Cython debug symbols |
||||||
|
cython_debug/ |
||||||
|
@ -1,5 +0,0 @@ |
|||||||
# file GENERATED by distutils, do NOT edit |
|
||||||
README |
|
||||||
setup.py |
|
||||||
gptc/__init__.py |
|
||||||
gptc/__main__.py |
|
@ -1,76 +0,0 @@ |
|||||||
#!/usr/bin/env python3 |
|
||||||
import sys |
|
||||||
import spacy |
|
||||||
|
|
||||||
nlp = spacy.load('en_core_web_sm') |
|
||||||
|
|
||||||
def listify(text):
    """Split *text* into a list of lower-cased lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. A
    token is kept only when its lemma starts with an ASCII letter, which
    drops punctuation, numbers and whitespace tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased lemma strings, in document order.
    """
    letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    return [
        token.lemma_.lower()
        for token in nlp(text)
        # An empty lemma would make ``lemma_[0]`` raise IndexError, so
        # such tokens are skipped instead of crashing the whole call.
        if token.lemma_ and token.lemma_[0] in letters
    ]
|
||||||
|
|
||||||
|
|
||||||
def compile(raw_model):
    """Build a compiled model from a raw model.

    Each portion of *raw_model* is read through its ``'text'`` and
    ``'category'`` keys. The lemmas of every portion are collected per
    category, then 21 candidate models are scored, one for each stop-word
    weight in 0.0, 0.1, ..., 2.0. Scoring counts how many portions of
    *raw_model* itself each candidate classifies correctly.

    Args:
        raw_model: An iterable of mappings with ``'text'`` and
            ``'category'`` keys. It is iterated more than once, so it
            must not be a one-shot iterator.

    Returns:
        A dict with a ``'text'`` key (category -> sorted list of lemmas)
        and a ``'stopword'`` key (the best-scoring stop-word weight). On
        ties the lowest weight wins.
    """
    model = {}

    for portion in raw_model:
        words = listify(portion['text'])
        category = portion['category']
        # Only create the category once it has at least one word, so that
        # categories never map to an empty list.
        if words:
            model.setdefault(category, []).extend(words)

    # Sort once at the end instead of after every insertion.
    for words in model.values():
        words.sort()

    # All candidates share the same word lists; only the weight differs.
    all_models = [{'text': model, 'stopword': i / 10} for i in range(0, 21)]
    for test_model in all_models:
        classifier = Classifier(test_model)
        test_model['correct'] = sum(
            1
            for portion in raw_model
            if classifier.check(portion['text']) == portion['category']
        )
        print('tested a model')

    # max() returns the first maximal element, so ties keep the earliest
    # (lowest stop-word weight) candidate.
    best = max(all_models, key=lambda candidate: candidate['correct'])
    # 'correct' is bookkeeping for the selection only, not model data.
    del best['correct']
    return best
|
||||||
|
|
||||||
|
|
||||||
class Classifier:
    """Classify text against a compiled (or raw) model."""

    def __init__(self, model, supress_uncompiled_model_warning=False):
        """Store *model*, compiling it first when it is a raw model.

        A model counts as compiled when it is a dict whose ``'text'``
        entry is itself a dict. Anything else is passed to ``compile``.

        Args:
            model: A compiled model dict, or a raw model accepted by
                ``compile``.
            supress_uncompiled_model_warning: When true, the warning
                normally written to stderr for raw models is skipped.
        """
        # A raw model is a sequence of portions, so ``model['text']`` must
        # not be evaluated on it directly: indexing a list with a string
        # raises TypeError before the compile fallback can run.
        if isinstance(model, dict) and isinstance(model.get('text'), dict):
            self.model = model
        else:
            self.model = compile(model)
            if not supress_uncompiled_model_warning:
                print('WARNING: model was not compiled', file=sys.stderr)
                print('In development, this is OK, but precompiling the model is preferred for production use.', file=sys.stderr)
        self.warn = supress_uncompiled_model_warning

    def check(self, text):
        """Return the category that best matches *text*.

        Every lemma of *text* adds a weight to each category for each
        time it occurs in that category's word list. The weight is the
        model's ``'stopword'`` value (0.5 when the model has none) for
        stop words and 1 otherwise, divided by the length of the
        category's word list.

        Args:
            text: The string to classify.

        Returns:
            The name of the highest-scoring category, or ``'unknown'``
            when no category scores above zero.
        """
        stopword_value = self.model.get('stopword', 0.5)
        stopwords = spacy.lang.en.stop_words.STOP_WORDS
        categories = self.model['text']

        probs = {}
        for word in listify(text):
            # The per-word factor does not depend on the category.
            base = stopword_value if word in stopwords else 1
            for category, catwords in categories.items():
                for catword in catwords:
                    if word == catword:
                        weight = base / len(catwords)
                        probs[category] = probs.get(category, 0) + weight

        # Strict '>' keeps the first category on ties and leaves
        # 'unknown' in place when every score is zero.
        most_likely = ['unknown', 0]
        for category, score in probs.items():
            if score > most_likely[1]:
                most_likely = [category, score]
        return most_likely[0]
|
@ -1,24 +0,0 @@ |
|||||||
#!/usr/bin/env python3
"""Command-line entry point for the General Purpose Text Classifier.

Loads a JSON model from the path given as ``model``. With
``-c``/``--compile`` the compiled model is written to the given output
file; otherwise a text is read (prompted for on a terminal, else taken
from standard input) and its category is printed.
"""
import argparse
import json
import sys  # required for the sys.stdin checks below

parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
parser.add_argument('model', help='model to use')
parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
args = parser.parse_args()

import gptc # PEP 8 violation, but don't fix it
# Way better for performance of argparse checking

with open(args.model, 'r') as f:
    raw_model = json.load(f)

if args.compile:
    with open(args.compile, 'w+') as f:
        json.dump(gptc.compile(raw_model), f)
else:
    classifier = gptc.Classifier(raw_model)
    # Prompt only when a person is at the keyboard; otherwise consume the
    # whole piped input.
    if sys.stdin.isatty():
        text = input('Text to analyse: ')
    else:
        text = sys.stdin.read()
    print(classifier.check(text))
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in new issue