scoopgracie
4 years ago
6 changed files with 138 additions and 108 deletions
@ -1,3 +1,138 @@
|
||||
__pycache__ |
||||
*.swp |
||||
venv |
||||
# Byte-compiled / optimized / DLL files |
||||
__pycache__/ |
||||
*.py[cod] |
||||
*$py.class |
||||
|
||||
# C extensions |
||||
*.so |
||||
|
||||
# Distribution / packaging |
||||
.Python |
||||
build/ |
||||
develop-eggs/ |
||||
dist/ |
||||
downloads/ |
||||
eggs/ |
||||
.eggs/ |
||||
lib/ |
||||
lib64/ |
||||
parts/ |
||||
sdist/ |
||||
var/ |
||||
wheels/ |
||||
share/python-wheels/ |
||||
*.egg-info/ |
||||
.installed.cfg |
||||
*.egg |
||||
MANIFEST |
||||
|
||||
# PyInstaller |
||||
# Usually these files are written by a python script from a template |
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it. |
||||
*.manifest |
||||
*.spec |
||||
|
||||
# Installer logs |
||||
pip-log.txt |
||||
pip-delete-this-directory.txt |
||||
|
||||
# Unit test / coverage reports |
||||
htmlcov/ |
||||
.tox/ |
||||
.nox/ |
||||
.coverage |
||||
.coverage.* |
||||
.cache |
||||
nosetests.xml |
||||
coverage.xml |
||||
*.cover |
||||
*.py,cover |
||||
.hypothesis/ |
||||
.pytest_cache/ |
||||
cover/ |
||||
|
||||
# Translations |
||||
*.mo |
||||
*.pot |
||||
|
||||
# Django stuff: |
||||
*.log |
||||
local_settings.py |
||||
db.sqlite3 |
||||
db.sqlite3-journal |
||||
|
||||
# Flask stuff: |
||||
instance/ |
||||
.webassets-cache |
||||
|
||||
# Scrapy stuff: |
||||
.scrapy |
||||
|
||||
# Sphinx documentation |
||||
docs/_build/ |
||||
|
||||
# PyBuilder |
||||
.pybuilder/ |
||||
target/ |
||||
|
||||
# Jupyter Notebook |
||||
.ipynb_checkpoints |
||||
|
||||
# IPython |
||||
profile_default/ |
||||
ipython_config.py |
||||
|
||||
# pyenv |
||||
# For a library or package, you might want to ignore these files since the code is |
||||
# intended to run in multiple environments; otherwise, check them in: |
||||
# .python-version |
||||
|
||||
# pipenv |
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. |
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies |
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not |
||||
# install all needed dependencies. |
||||
#Pipfile.lock |
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow |
||||
__pypackages__/ |
||||
|
||||
# Celery stuff |
||||
celerybeat-schedule |
||||
celerybeat.pid |
||||
|
||||
# SageMath parsed files |
||||
*.sage.py |
||||
|
||||
# Environments |
||||
.env |
||||
.venv |
||||
env/ |
||||
venv/ |
||||
ENV/ |
||||
env.bak/ |
||||
venv.bak/ |
||||
|
||||
# Spyder project settings |
||||
.spyderproject |
||||
.spyproject |
||||
|
||||
# Rope project settings |
||||
.ropeproject |
||||
|
||||
# mkdocs documentation |
||||
/site |
||||
|
||||
# mypy |
||||
.mypy_cache/ |
||||
.dmypy.json |
||||
dmypy.json |
||||
|
||||
# Pyre type checker |
||||
.pyre/ |
||||
|
||||
# pytype static type analyzer |
||||
.pytype/ |
||||
|
||||
# Cython debug symbols |
||||
cython_debug/ |
||||
|
@ -1,5 +0,0 @@
|
||||
# file GENERATED by distutils, do NOT edit |
||||
README |
||||
setup.py |
||||
gptc/__init__.py |
||||
gptc/__main__.py |
@ -1,76 +0,0 @@
|
||||
#!/usr/bin/env python3
import sys
import spacy

# Shared spaCy English pipeline, loaded once at import time and used by
# listify() for tokenization/lemmatization. Requires the 'en_core_web_sm'
# model to be installed.
nlp = spacy.load('en_core_web_sm')
||||
|
||||
def listify(text):
    """Lemmatize *text* and return the lowercase lemmas whose first
    character is an ASCII letter (punctuation/number tokens are dropped).
    """
    lemmas = []
    for token in nlp(text):
        lemma = token.lemma_
        # NOTE(review): assumes every token has a non-empty lemma — spaCy
        # tokens normally do, but an empty lemma would raise IndexError here.
        if lemma[0] in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
            lemmas.append(lemma.lower())
    return lemmas
||||
|
||||
|
||||
def compile(raw_model):
    """Compile a raw model into a trained model and tune its stopword weight.

    A raw model is a list of ``{'text': str, 'category': str}`` portions.
    Returns a compiled model dict with keys:

    * ``'text'``     -- mapping of category name to a sorted word list
      (duplicates kept; duplicate count acts as the word's weight),
    * ``'stopword'`` -- the stopword weight (0.0-2.0) that classified the
      training data best.
    """
    model = {}
    for portion in raw_model:
        # setdefault avoids the KeyError the old code hit when a portion's
        # text lemmatized to an empty word list (the bare except only
        # handled the first word of a category).
        words = model.setdefault(portion['category'], [])
        words.extend(listify(portion['text']))
        words.sort()

    # Candidate stopword weights 0.0 .. 2.0 in steps of 0.1; all candidates
    # share the same 'text' mapping, which check() only reads.
    all_models = [{'text': model, 'stopword': i / 10} for i in range(0, 21)]
    for test_model in all_models:
        correct = 0
        classifier = Classifier(test_model)
        for portion in raw_model:
            if classifier.check(portion['text']) == portion['category']:
                correct += 1
        test_model['correct'] = correct
        print('tested a model')

    # Keep the candidate with the highest training accuracy (first wins ties).
    best = all_models[0]
    for test_model in all_models:
        if test_model['correct'] > best['correct']:
            best = test_model
    del best['correct']
    return best
||||
|
||||
|
||||
class Classifier:
    """Classify text against a model produced by :func:`compile`.

    Accepts either a compiled model (a dict with a ``'text'`` mapping) or a
    raw model (a list of portions), which is compiled on the fly with a
    warning.
    """

    def __init__(self, model, supress_uncompiled_model_warning=False):
        # A compiled model is a dict whose 'text' value is a dict. The old
        # type check (model['text']) raised TypeError on a raw model, which
        # is a JSON list — isinstance makes the compile-on-the-fly branch
        # actually reachable.
        if isinstance(model, dict) and isinstance(model.get('text'), dict):
            self.model = model
        else:
            self.model = compile(model)
            if not supress_uncompiled_model_warning:
                print('WARNING: model was not compiled', file=sys.stderr)
                print('In development, this is OK, but precompiling the model is preferred for production use.', file=sys.stderr)
        self.warn = supress_uncompiled_model_warning

    def check(self, text):
        """Return the most likely category name for *text*, or ``'unknown'``
        if no word of *text* appears in any category's word list.
        """
        model = self.model
        # Stopword weight defaults to 0.5 for models compiled before tuning
        # existed; dict.get replaces the old bare except.
        stopword_value = model.get('stopword', 0.5)
        stopwords = spacy.lang.en.stop_words.STOP_WORDS
        word_lists = model['text']
        words = listify(text)

        probs = {}
        for word in words:
            for category, cat_words in word_lists.items():
                # count() preserves the original duplicate-weighting: each
                # occurrence of the word in the category list adds weight,
                # but runs at C speed instead of a Python equality loop.
                occurrences = cat_words.count(word)
                if occurrences:
                    weight = (stopword_value if word in stopwords else 1) / len(cat_words)
                    probs[category] = probs.get(category, 0) + occurrences * weight

        best_category = 'unknown'
        best_prob = 0
        for category, prob in probs.items():
            if prob > best_prob:
                best_category = category
                best_prob = prob
        return best_category
@ -1,24 +0,0 @@
|
||||
#!/usr/bin/env python3
"""Command-line interface: compile a raw model or classify text with it."""
import argparse
import json
import sys  # fix: sys.stdin was used below but sys was never imported

parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
parser.add_argument('model', help='model to use')
parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile')
args = parser.parse_args()

import gptc  # PEP 8 violation, but don't fix it
# Way better for performance of argparse checking

with open(args.model, 'r') as f:
    raw_model = json.load(f)
if args.compile:
    # Compile mode: write the compiled model as JSON to the given outfile.
    with open(args.compile, 'w+') as f:
        json.dump(gptc.compile(raw_model), f)
else:
    # Classify mode: prompt interactively on a TTY, otherwise read stdin.
    classifier = gptc.Classifier(raw_model)
    if sys.stdin.isatty():
        text = input('Text to analyse: ')
    else:
        text = sys.stdin.read()
    print(classifier.check(text))
Binary file not shown.
Binary file not shown.
Loading…
Reference in new issue