Compare commits

...

7 Commits

SHA1 Message Date
b133facd70 Recompile model 2022-05-21 14:03:08 -07:00
4188045b75 New CLI tool 2022-05-21 14:02:20 -07:00
e06f2def24 Make pack a function 2022-05-21 13:09:53 -07:00
bebd286163 Fix heading 2022-05-21 12:54:31 -07:00
2d9f7cfc5a Add blank lines before and after headings in README 2022-05-20 17:22:37 -07:00
5378be9418 Format code 2022-05-20 17:16:00 -07:00
4ddeefad07 Remove automatic recompilation 2022-05-18 15:14:08 -07:00
6 changed files with 100 additions and 70 deletions

View File

@@ -1,42 +1,54 @@
# GPTC
General-purpose text classifier in Python
GPTC provides both a CLI tool and a Python library.
## CLI Tool
### Classifying text
-python -m gptc <modelfile>
+python -m gptc classify <compiled model file>
+This will prompt for a string and classify it, then print (in JSON) a dict of
+the format `{category: probability, category:probability, ...}` to stdout.
+Alternatively, if you only need the most likely category, you can use this:
+python -m gptc classify [-c|--category] <compiled model file>
This will prompt for a string and classify it, outputting the category on
stdout (or "None" if it cannot determine anything).
-Alternatively, if you need confidence data, use:
-python -m gptc -j <modelfile>
-This will print (in JSON) a dict of the format `{category: probability,
-category:probability, ...}` to stdout.
### Compiling models
-python -m gptc <raw model file> -c|--compile <compiled model file>
+python -m gptc compile <raw model file>
+This will print the compiled model in JSON to stdout.
## Library
### `gptc.Classifier(model)`
Create a `Classifier` object using the given *compiled* model (as a dict, not
JSON).
#### `Classifier.confidence(text)`
Classify `text`. Returns a dict of the format `{category: probability,
category:probability, ...}`
#### `Classifier.classify(text)`
Classify `text`. Returns the category into which the text is placed (as a
string), or `None` when it cannot classify the text.
-## `gptc.compile(raw_model)`
+### `gptc.compile(raw_model)`
Compile a raw model (as a list, not JSON) and return the compiled model (as a
dict).
## Model format
This section explains the raw model format, which is how you should create and
edit models.
@@ -55,6 +67,7 @@ in any way these Python objects can be. However, it is recommended to store
them in JSON format for compatibility with the command-line tool.
## Example model
An example model, which is designed to distinguish between texts written by
Mark Twain and those written by William Shakespeare, is available in `models`.
The raw model is in `models/raw.json`; the compiled model is in
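Taken together, the CLI and library sections of the README above boil down to a few lines of Python. A minimal sketch of the library usage, assuming the example raw model shipped in `models/raw.json` (any raw model in the documented format works; the sample string is made up):

```python
import json

import gptc

# A raw model is a list of {"category": ..., "text": ...} records stored as JSON.
with open("models/raw.json") as f:
    raw_model = json.load(f)

# gptc.compile() turns the raw model (a list) into a compiled model (a dict).
model = gptc.compile(raw_model)

classifier = gptc.Classifier(model)

# confidence() returns {category: probability, ...}; classify() returns the
# single most likely category, or None if it cannot decide.
text = "An example passage to classify."
print(classifier.confidence(text))
print(classifier.classify(text))
```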

View File

@@ -6,28 +6,44 @@ import json
import sys
import gptc
-parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
-parser.add_argument("model", help="model to use")
-parser.add_argument(
-    "-c", "--compile", help="compile raw model model to outfile", metavar="outfile"
+parser = argparse.ArgumentParser(description="General Purpose Text Classifier", prog='gptc')
+subparsers = parser.add_subparsers(dest="subparser_name", required=True)
+compile_parser = subparsers.add_parser('compile', help='compile a raw model')
+compile_parser.add_argument("model", help="raw model to compile")
+classify_parser = subparsers.add_parser('classify', help='classify text')
+classify_parser.add_argument("model", help="compiled model to use")
+group = classify_parser.add_mutually_exclusive_group()
+group.add_argument(
+    "-j",
+    "--json",
+    help="output confidence dict as JSON (default)",
+    action="store_true",
)
-parser.add_argument(
-    "-j", "--confidence", help="output confidence dict in json", action="store_true"
+group.add_argument(
+    "-c",
+    "--category",
+    help="output most likely category or `None`",
+    action="store_true",
)
args = parser.parse_args()
with open(args.model, "r") as f:
-    raw_model = json.load(f)
-if args.compile:
-    with open(args.compile, "w+") as f:
-        json.dump(gptc.compile(raw_model), f)
+    model = json.load(f)
+if args.subparser_name == 'compile':
+    print(json.dumps(gptc.compile(model)))
else:
-    classifier = gptc.Classifier(raw_model)
+    classifier = gptc.Classifier(model)
    if sys.stdin.isatty():
        text = input("Text to analyse: ")
    else:
        text = sys.stdin.read()
-    if args.confidence:
-        print(json.dumps(classifier.confidence(text)))
-    else:
+    if args.category:
        print(classifier.classify(text))
+    else:
+        print(json.dumps(classifier.confidence(text)))
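For reference, a standalone sketch of the new subcommand layout (same argument names as in the diff above, help strings omitted, file names made up), showing what parse_args() yields for each form of the CLI:

```python
import argparse

parser = argparse.ArgumentParser(prog="gptc")
subparsers = parser.add_subparsers(dest="subparser_name", required=True)

compile_parser = subparsers.add_parser("compile")
compile_parser.add_argument("model")

classify_parser = subparsers.add_parser("classify")
classify_parser.add_argument("model")
group = classify_parser.add_mutually_exclusive_group()
group.add_argument("-j", "--json", action="store_true")
group.add_argument("-c", "--category", action="store_true")

args = parser.parse_args(["compile", "raw.json"])
print(args.subparser_name, args.model)       # compile raw.json

args = parser.parse_args(["classify", "compiled.json", "--category"])
print(args.subparser_name, args.category)    # classify True
```

Passing both -j and -c to classify is rejected by the mutually exclusive group, and omitting the subcommand entirely is an error because of required=True.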

View File

@@ -20,26 +20,11 @@ class Classifier:
"""
def __init__(self, model):
try:
model_version = model["__version__"]
except:
model_version = 1
if model_version == 3:
self.model = model
else:
# The model is an unsupported version
try:
raw_model = model["__raw__"]
except:
raise gptc.exceptions.UnsupportedModelError(
"this model is unsupported and does not contain a raw model for recompiling"
)
warnings.warn(
"model needed to be recompiled on-the-fly; please re-compile it and use the new compiled model in the future"
if model.get("__version__", 0) != 3:
raise gptc.exceptions.UnsupportedModelError(
f"unsupported model version"
)
self.model = gptc.compiler.compile(raw_model)
self.model = model
def confidence(self, text):
"""Classify text with confidence.
@@ -74,7 +59,8 @@ class Classifier:
            except KeyError:
                pass
        probs = {
-            model["__names__"][category]: value for category, value in probs.items()
+            model["__names__"][category]: value
+            for category, value in probs.items()
        }
        total = sum(probs.values())
        probs = {category: value / total for category, value in probs.items()}
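With the automatic recompilation path removed, anything other than a version-3 compiled model now fails fast instead of being rebuilt from `__raw__`. A small sketch of the new behaviour (the dict literal stands in for an old or malformed model):

```python
import gptc
import gptc.exceptions

try:
    gptc.Classifier({"__version__": 1})
except gptc.exceptions.UnsupportedModelError as error:
    print(error)  # unsupported model version
```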

View File

@@ -39,9 +39,13 @@ def compile(raw_model):
        categories_by_count[category] = {}
        for word in text:
            try:
-                categories_by_count[category][word] += 1 / len(categories[category])
+                categories_by_count[category][word] += 1 / len(
+                    categories[category]
+                )
            except KeyError:
-                categories_by_count[category][word] = 1 / len(categories[category])
+                categories_by_count[category][word] = 1 / len(
+                    categories[category]
+                )
    word_weights = {}
    for category, words in categories_by_count.items():
        for word, value in words.items():
@@ -55,11 +59,12 @@ def compile(raw_model):
        total = sum(weights.values())
        model[word] = []
        for category in names:
-            model[word].append(round((weights.get(category, 0) / total) * 65535))
+            model[word].append(
+                round((weights.get(category, 0) / total) * 65535)
+            )
    model["__names__"] = names
    model["__version__"] = 3
-    model["__raw__"] = raw_model
    return model
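The scaling in this hunk stores, for each word, one integer per category: that category's share of the word's weight mapped onto 0-65535. A worked example with made-up category names and weights:

```python
# Hypothetical per-word weights for two categories.
names = ["mark_twain", "william_shakespeare"]
weights = {"mark_twain": 3, "william_shakespeare": 1}

total = sum(weights.values())
row = [round((weights.get(category, 0) / total) * 65535) for category in names]
print(row)  # [49151, 16384] -- 75% and 25% of 65535, rounded to integers
```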

File diff suppressed because one or more lines are too long

View File

@@ -4,28 +4,38 @@ import sys
import os
import json
+def pack(directory, print_exceptions=True):
+    paths = os.listdir(directory)
+    texts = {}
+    exceptions = []
+    for path in paths:
+        texts[path] = []
+        try:
+            for file in os.listdir(os.path.join(sys.argv[1], path)):
+                try:
+                    with open(os.path.join(sys.argv[1], path, file)) as f:
+                        texts[path].append(f.read())
+                except Exception as e:
+                    exceptions.append((e,))
+                    if print_exceptions:
+                        print(e, file=sys.stderr)
+        except Exception as e:
+            exceptions.append((e,))
+            if print_exceptions:
+                print(e, file=sys.stderr)
+    raw_model = []
+    for category, cat_texts in texts.items():
+        raw_model += [{"category": category, "text": i} for i in cat_texts]
+    return raw_model, exceptions
if len(sys.argv) != 2:
    print("usage: pack.py <path>", file=sys.stderr)
    exit(1)
-paths = os.listdir(sys.argv[1])
-texts = {}
-for path in paths:
-    texts[path] = []
-    try:
-        for file in os.listdir(os.path.join(sys.argv[1], path)):
-            try:
-                with open(os.path.join(sys.argv[1], path, file)) as f:
-                    texts[path].append(f.read())
-            except Exception as e:
-                print(e, file=sys.stderr)
-    except Exception as e:
-        print(e, file=sys.stderr)
-raw_model = []
-for category, cat_texts in texts.items():
-    raw_model += [{"category": category, "text": i} for i in cat_texts]
-print(json.dumps(raw_model))
+print(json.dumps(pack(sys.argv[1])[0]))
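The raw model that pack() assembles (and that gptc.compile consumes) is simply a list of category/text records, one per input file, with the category taken from the sub-directory name. A made-up two-entry example of that structure:

```python
import json

# Hypothetical records; in practice "text" holds the full contents of each file.
raw_model = [
    {"category": "mark_twain", "text": "First sample text ..."},
    {"category": "william_shakespeare", "text": "Second sample text ..."},
]
print(json.dumps(raw_model))
```

Running `python pack.py <path>` on a directory with one sub-directory of text files per category prints exactly this kind of list as JSON.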