From 11b3785f846b38dad6522502ecaff17caa88dd7c Mon Sep 17 00:00:00 2001 From: scoopgracie Date: Mon, 30 Mar 2020 09:45:26 -0700 Subject: [PATCH] Bugfix release --- MANIFEST | 2 + README.md => README | 0 build/lib/gptc/__init__.py | 76 +++++++++++++++++++++++++++++++++++++ build/lib/gptc/__main__.py | 24 ++++++++++++ dist/gptc-0.0.1.tar.gz | Bin 0 -> 3478 bytes gptc/__init__.py | 21 ---------- gptc/__main__.py | 24 ++++++++++++ setup.py | 2 +- 8 files changed, 127 insertions(+), 22 deletions(-) rename README.md => README (100%) create mode 100644 build/lib/gptc/__init__.py create mode 100644 build/lib/gptc/__main__.py create mode 100644 dist/gptc-0.0.1.tar.gz create mode 100644 gptc/__main__.py diff --git a/MANIFEST b/MANIFEST index cf8ce4d..af34275 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1,3 +1,5 @@ # file GENERATED by distutils, do NOT edit +README setup.py gptc/__init__.py +gptc/__main__.py diff --git a/README.md b/README similarity index 100% rename from README.md rename to README diff --git a/build/lib/gptc/__init__.py b/build/lib/gptc/__init__.py new file mode 100644 index 0000000..061c263 --- /dev/null +++ b/build/lib/gptc/__init__.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +import sys +import spacy + +nlp = spacy.load('en_core_web_sm') + +def listify(text): + return [string.lemma_.lower() for string in nlp(text) if string.lemma_[0] in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'] + + +def compile(raw_model): + model = {} + + for portion in raw_model: + text = listify(portion['text']) + category = portion['category'] + for word in text: + try: + model[category].append(word) + except: + model[category] = [word] + model[category].sort() + all_models = [ { 'text': model, 'stopword': i/10} for i in range(0, 21) ] + for test_model in all_models: + correct = 0 + classifier = Classifier(test_model) + for text in raw_model: + if classifier.check(text['text']) == text['category']: + correct += 1 + test_model['correct'] = correct + print('tested a model') + best = all_models[0] + for test_model in all_models: + if test_model['correct'] > best['correct']: + best = test_model + del best['correct'] + return best + return {'text': model} + + +class Classifier: + def __init__(self, model, supress_uncompiled_model_warning=False): + if type(model['text']) == dict: + self.model = model + else: + self.model = compile(model) + if not supress_uncompiled_model_warning: + print('WARNING: model was not compiled', file=sys.stderr) + print('In development, this is OK, but precompiling the model is preferred for production use.', file=sys.stderr) + self.warn = supress_uncompiled_model_warning + + def check(self, text): + model = self.model + stopword_value = 0.5 + try: + stopword_value = model['stopword'] + except: + pass + stopwords = spacy.lang.en.stop_words.STOP_WORDS + model = model['text'] + text = listify(text) + probs = {} + for word in text: + for category in model.keys(): + for catword in model[category]: + if word == catword: + weight = ( stopword_value if word in stopwords else 1 ) / len(model[category]) + try: + probs[category] += weight + except: + probs[category] = weight + most_likely = ['unknown', 0] + for category in probs.keys(): + if probs[category] > most_likely[1]: + most_likely = [category, probs[category]] + return most_likely[0] diff --git a/build/lib/gptc/__main__.py b/build/lib/gptc/__main__.py new file mode 100644 index 0000000..ac04aa7 --- /dev/null +++ b/build/lib/gptc/__main__.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +import argparse +import json + +parser = argparse.ArgumentParser(description="General Purpose Text Classifier") +parser.add_argument('model', help='model to use') +parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile') +args = parser.parse_args() + +import gptc # PEP 8 violation, but don't fix it +# Way better for performance of argparse checking + +with open(args.model, 'r') as f: + raw_model = json.load(f) +if args.compile: + with open(args.compile, 'w+') as f: + json.dump(gptc.compile(raw_model), f) +else: + classifier = gptc.Classifier(raw_model) + if sys.stdin.isatty(): + text = input('Text to analyse: ') + else: + text = sys.stdin.read() + print(classifier.check(text)) diff --git a/dist/gptc-0.0.1.tar.gz b/dist/gptc-0.0.1.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..926e051783ae34673307468e334f1be608e977de GIT binary patch literal 3478 zcmV;H4QcWpiwFqbA%b24|72-%bT4OcbYm?rE-)@JE_7jX0PPxSQyWK?&;2Vp;)A4$ zG)RCT(Hhqhj0v$h){uC!6eU_SEve0DW-{G_to7!cQF`4DhqRJ?ND`-CF5yY;E>7*L&L=TPwZ(V12u{LI*2fU}aLV zl+qRH$8j=EnJ@UgaG%?Eg1FB=$(k1S$FFPuANJo`Umw)=|0C=_8i`1#(a1}dU&{W2 zet#pk|N3BarMKSO*w|d5z1sf2`&#{KEtBcmSVU_)x}nKZ&EjZ-hFQl%oA zdLf_B*$9vu?^$$uFr?+0EkP z^547P?Y-K6eenI^4{zQc9iN>3a?gqj?SLZUgifyRYQ!T~3)1-@wF^X6qchx3`@(>PrMY!2g|0b33X2`=I^ zKx&AhAf$<<%j$!i=L?7H(ql=&qo9RY+7)2_(dUW!ECS&00uNu3$EZjUqGj;2Ff>|` zc$9vkM);vQ)1ghN;si0mBeB-+{i>xDMxoJ^w|X7A-fz>(T;+;OWl$nefxqA#JTB!v zYQ9%I3z?K+B6tdq_wpaD0$aPHXKlLst%?gAufXW}Gwxq&m&|QTcXp_GP#E&Ab{(f* z@6dk9YaB~(*Z?$9@%DQTmw@A|RujQ_K$+2K(}*#Khj66O4n)Q?Gzj9|&&5lPx%8we zlOGzhcgKgv2Ucfv!KB9Lurxb#0-rmOvOTE+o~G@GFdasq*c%?k$(%>3L)A=33ja>t zbm%x!0LBegOvY-)Ek%HUoe5wDU(}f~jf2eB$uX1MdyE6EB$N<~$7eJ^a?_ijGj=Z( zl5(alv|rG&>1K4p!i-~z@CM|;6nY2RI!K2vo z&QDH9?@!*nI=drG#Wx=+F0@Xe){^BRjfUcyhu9fhG_&YBiWd=tQm@(+luf3w-fJ>Y+HI0A%QPmR=QgT>A)Cd^h=JXedj0v<-T80MMD*Lwe_NZI56yr5&E9%G|7~t<==rbT zujju%e*RTnmYMZkS4&Z3?a5 z#Lm=ib^%7wIA$iFw29C-rDugV(6><9JwUqnx`Gv##ujArq=iCycPD1(_S&Rqhbz;U zK2jRNic`dvDcdB9$PW%55gI6dDZ zHc59Ao1}F(l|mZ2&Qo0)2w7dFdv6cvc^rr2a6*@HMn8ZOVIYlkoCYzK@tji$ITEt2 zL*nbyNZ}t&l#os!_XRN2M38?nq&VXQ@hZUU0e1b6MbKvB+yjHCO@J@H>UQb-8OIAb zp#+1m2zObjq>y<0f%`Dn;&7&?AfO2CvYsbgy`Wd0y+SxoGk)cfZucefa;1~~3k`x5 z;^mb`4?%1ZhS(|36QvV7v)7d!it<_Mp>0+jAo^ehaM>_rMVxk(Bg;D{=>XOg!TCuJkGGbL)k`5Ae zAt3Gg|H;cKec`lJ)_``0w_=>Kbm_doIRc_Z=R)m3;DEsuctIF0bAyBV^+^w~tQr=| zRl`(_E3yZjxm>2oS>xE)bx3yK-DE~_W!HN55N;Ef4^tsl-#j@w|q zN%eXIJA@X~@KD}FZd@oIfNd8BbHo`rf_81z30XzrqJsrUGlF6nq=PeEO;I4EpGir8 z$lRkp112DkcmkUo0zLmDiCf4gVw$B|F>`F*fV*?Nypr5pL-CIww6OXq`aQ#V$La{+ zXYoQ~m$NtvAxf=-MFHUfD8ujz$lo~>ph6&P@aU^pbT9N13@>yo>FH=3^AJ$$8`znH zADRoTi@aC)R2jh=sOH9yHeR?3>9TQOHab;kw_#3HAag)-7j#L;ZU_O8O|8q#K){g8 zTq0Qx5J_9ecCIwpMpx0I=^b**nr$?vz9@uj8sHsGY8`rxGGK(|ddHD6#P-)`F4fxt z(8U6vb3O)u2~5zta|vAoN`{2g2M(JW?<)YRta)^uZeu*!o=Bqbda%O{fN@Zf8QyaPV1hZmuICIiv!DD5B#f zIb!KGJzp>sRGS8yk7A#`2lEKv@NCAexlA}jA*5}Xl45DIfORV5S?GJuYY1<4!||mD zoyHM2=@CUrvT-PUP#yYX$N2)D0)nV5sg@yH7W4`1PbOezEM+sMOaP(BaV7=$vf_Xq zkvce^qn}^v_m}8P(>M-Dz~?dK^HldfI$4#|k}><0Lh(nkkZ=Sb9?oA^e{;=J$-T`n za`%`y;*tNg|Cdl(lHc;)=ZXFQMsIy9zyBHZ2iX5_Z*SH8|Cd-3=v;NEW=e#3C4i~T zzVSzYAo~9n6qJa~@%2a(J*L^o3(_iPThk2yK!D#;XuSPvHpK!MLJFm=qb2l+IE>p3 z{ifJpy6k8qdoH4>L^@EL8i)^T8f0>b#T{sCM|H$k=^2>LBtdMu=NR!(HXj2;jBRLB zpsoW}nTt8S~Zv)-+l=5SZAo zZAn!H6B}dx*1ESP1W2(A@+n@&G|$j^547`^Oh-#bJYOcdXlWFv`TYEdL%PKKrDiDv z{$Bcyf)_TaAlmO9!C63+r5b}%TS;RjL~FTgt8>JozS^H;1ldhDxPfLVC(KY3`Z&dB@-|`?p##W zcu1#Q4?nJ`Gwe-cXeeB+Hos2%BhR&Uik` z$}GW@_+&cXz?%0-z+2A&-hK}7H_rk7Rs(}Ow(D1Uf5rRX(>Dj*!{gT{U%LP6Z*1K8 z``?YN^}7H266*-cL%Q%3L)vI3hs#m@0^_N@!1-zTG?Ep{! E0KW^_bN~PV literal 0 HcmV?d00001 diff --git a/gptc/__init__.py b/gptc/__init__.py index 43859a8..061c263 100755 --- a/gptc/__init__.py +++ b/gptc/__init__.py @@ -1,8 +1,6 @@ #!/usr/bin/env python3 import sys -import json import spacy -import argparse nlp = spacy.load('en_core_web_sm') @@ -76,22 +74,3 @@ class Classifier: if probs[category] > most_likely[1]: most_likely = [category, probs[category]] return most_likely[0] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="General Purpose Text Classifier") - parser.add_argument('model', help='model to use') - parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile') - args = parser.parse_args() - with open(args.model, 'r') as f: - raw_model = json.load(f) - if args.compile: - with open(args.compile, 'w+') as f: - json.dump(compile(raw_model), f) - else: - classifier = Classifier(raw_model) - if sys.stdin.isatty(): - text = input('Text to analyse: ') - else: - text = sys.stdin.read() - print(classifier.check(text)) diff --git a/gptc/__main__.py b/gptc/__main__.py new file mode 100644 index 0000000..ac04aa7 --- /dev/null +++ b/gptc/__main__.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +import argparse +import json + +parser = argparse.ArgumentParser(description="General Purpose Text Classifier") +parser.add_argument('model', help='model to use') +parser.add_argument('-c', '--compile', help='compile raw model model to outfile', metavar='outfile') +args = parser.parse_args() + +import gptc # PEP 8 violation, but don't fix it +# Way better for performance of argparse checking + +with open(args.model, 'r') as f: + raw_model = json.load(f) +if args.compile: + with open(args.compile, 'w+') as f: + json.dump(gptc.compile(raw_model), f) +else: + classifier = gptc.Classifier(raw_model) + if sys.stdin.isatty(): + text = input('Text to analyse: ') + else: + text = sys.stdin.read() + print(classifier.check(text)) diff --git a/setup.py b/setup.py index 4c3c988..c05da85 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from distutils.core import setup setup( name = 'gptc', # How you named your package folder (MyLib) packages = ['gptc'], # Chose the same as "name" - version = '0.0.0', # Start with a small number and increase it with every change you make + version = '0.0.1', # Start with a small number and increase it with every change you make license='MIT', # Chose a license from here: https://help.github.com/articles/licensing-a-repository description = 'General-purpose English text classifier', # Give a short description about your library author = 'ScoopGracie', # Type in your name