Compare commits

...

7 Commits

Author SHA1 Message Date
b133facd70 Recompile model 2022-05-21 14:03:08 -07:00
4188045b75 New CLI tool 2022-05-21 14:02:20 -07:00
e06f2def24 Make pack a function 2022-05-21 13:09:53 -07:00
bebd286163 Fix heading 2022-05-21 12:54:31 -07:00
2d9f7cfc5a Add blank lines before and after headings in README 2022-05-20 17:22:37 -07:00
5378be9418 Format code 2022-05-20 17:16:00 -07:00
4ddeefad07 Remove automatic recompilation 2022-05-18 15:14:08 -07:00
6 changed files with 100 additions and 70 deletions

View File

@ -1,42 +1,54 @@
# GPTC # GPTC
General-purpose text classifier in Python General-purpose text classifier in Python
GPTC provides both a CLI tool and a Python library. GPTC provides both a CLI tool and a Python library.
## CLI Tool ## CLI Tool
### Classifying text ### Classifying text
python -m gptc <modelfile> python -m gptc classify <compiled model file>
This will prompt for a string and classify it, then print (in JSON) a dict of
the format `{category: probability, category: probability, ...}` to stdout.
Alternatively, if you only need the most likely category, you can use this:
python -m gptc classify -c|--category <compiled model file>
This will prompt for a string and classify it, outputting the category on This will prompt for a string and classify it, outputting the category on
stdout (or "None" if it cannot determine anything). stdout (or "None" if it cannot determine anything).
Alternatively, if you need confidence data, use:
python -m gptc -j <modelfile>
This will print (in JSON) a dict of the format `{category: probability,
category:probability, ...}` to stdout.
### Compiling models ### Compiling models
python -m gptc <raw model file> -c|--compile <compiled model file> python -m gptc compile <raw model file>
This will print the compiled model in JSON to stdout.
## Library ## Library
### `gptc.Classifier(model)` ### `gptc.Classifier(model)`
Create a `Classifier` object using the given *compiled* model (as a dict, not Create a `Classifier` object using the given *compiled* model (as a dict, not
JSON). JSON).
#### `Classifier.confidence(text)` #### `Classifier.confidence(text)`
Classify `text`. Returns a dict of the format `{category: probability, Classify `text`. Returns a dict of the format `{category: probability,
category:probability, ...}` category:probability, ...}`
#### `Classifier.classify(text)` #### `Classifier.classify(text)`
Classify `text`. Returns the category into which the text is placed (as a Classify `text`. Returns the category into which the text is placed (as a
string), or `None` when it cannot classify the text. string), or `None` when it cannot classify the text.
## `gptc.compile(raw_model)`
### `gptc.compile(raw_model)`
Compile a raw model (as a list, not JSON) and return the compiled model (as a Compile a raw model (as a list, not JSON) and return the compiled model (as a
dict). dict).
## Model format ## Model format
This section explains the raw model format, which is how you should create and This section explains the raw model format, which is how you should create and
edit models. edit models.
@ -55,6 +67,7 @@ in any way these Python objects can be. However, it is recommended to store
them in JSON format for compatibility with the command-line tool. them in JSON format for compatibility with the command-line tool.
## Example model ## Example model
An example model, which is designed to distinguish between texts written by An example model, which is designed to distinguish between texts written by
Mark Twain and those written by William Shakespeare, is available in `models`. Mark Twain and those written by William Shakespeare, is available in `models`.
The raw model is in `models/raw.json`; the compiled model is in The raw model is in `models/raw.json`; the compiled model is in

View File

@ -6,28 +6,44 @@ import json
import sys import sys
import gptc import gptc
parser = argparse.ArgumentParser(description="General Purpose Text Classifier") parser = argparse.ArgumentParser(description="General Purpose Text Classifier", prog='gptc')
parser.add_argument("model", help="model to use") subparsers = parser.add_subparsers(dest="subparser_name", required=True)
parser.add_argument(
"-c", "--compile", help="compile raw model model to outfile", metavar="outfile" compile_parser = subparsers.add_parser('compile', help='compile a raw model')
compile_parser.add_argument("model", help="raw model to compile")
classify_parser = subparsers.add_parser('classify', help='classify text')
classify_parser.add_argument("model", help="compiled model to use")
group = classify_parser.add_mutually_exclusive_group()
group.add_argument(
"-j",
"--json",
help="output confidence dict as JSON (default)",
action="store_true",
) )
parser.add_argument( group.add_argument(
"-j", "--confidence", help="output confidence dict in json", action="store_true" "-c",
"--category",
help="output most likely category or `None`",
action="store_true",
) )
args = parser.parse_args() args = parser.parse_args()
with open(args.model, "r") as f: with open(args.model, "r") as f:
raw_model = json.load(f) model = json.load(f)
if args.compile:
with open(args.compile, "w+") as f: if args.subparser_name == 'compile':
json.dump(gptc.compile(raw_model), f) print(json.dumps(gptc.compile(model)))
else: else:
classifier = gptc.Classifier(raw_model) classifier = gptc.Classifier(model)
if sys.stdin.isatty(): if sys.stdin.isatty():
text = input("Text to analyse: ") text = input("Text to analyse: ")
else: else:
text = sys.stdin.read() text = sys.stdin.read()
if args.confidence:
print(json.dumps(classifier.confidence(text))) if args.category:
else:
print(classifier.classify(text)) print(classifier.classify(text))
else:
print(json.dumps(classifier.confidence(text)))

View File

@ -20,26 +20,11 @@ class Classifier:
""" """
def __init__(self, model): def __init__(self, model):
try: if model.get("__version__", 0) != 3:
model_version = model["__version__"] raise gptc.exceptions.UnsupportedModelError(
except: f"unsupported model version"
model_version = 1
if model_version == 3:
self.model = model
else:
# The model is an unsupported version
try:
raw_model = model["__raw__"]
except:
raise gptc.exceptions.UnsupportedModelError(
"this model is unsupported and does not contain a raw model for recompiling"
)
warnings.warn(
"model needed to be recompiled on-the-fly; please re-compile it and use the new compiled model in the future"
) )
self.model = gptc.compiler.compile(raw_model) self.model = model
def confidence(self, text): def confidence(self, text):
"""Classify text with confidence. """Classify text with confidence.
@ -74,7 +59,8 @@ class Classifier:
except KeyError: except KeyError:
pass pass
probs = { probs = {
model["__names__"][category]: value for category, value in probs.items() model["__names__"][category]: value
for category, value in probs.items()
} }
total = sum(probs.values()) total = sum(probs.values())
probs = {category: value / total for category, value in probs.items()} probs = {category: value / total for category, value in probs.items()}

View File

@ -39,9 +39,13 @@ def compile(raw_model):
categories_by_count[category] = {} categories_by_count[category] = {}
for word in text: for word in text:
try: try:
categories_by_count[category][word] += 1 / len(categories[category]) categories_by_count[category][word] += 1 / len(
categories[category]
)
except KeyError: except KeyError:
categories_by_count[category][word] = 1 / len(categories[category]) categories_by_count[category][word] = 1 / len(
categories[category]
)
word_weights = {} word_weights = {}
for category, words in categories_by_count.items(): for category, words in categories_by_count.items():
for word, value in words.items(): for word, value in words.items():
@ -55,11 +59,12 @@ def compile(raw_model):
total = sum(weights.values()) total = sum(weights.values())
model[word] = [] model[word] = []
for category in names: for category in names:
model[word].append(round((weights.get(category, 0) / total) * 65535)) model[word].append(
round((weights.get(category, 0) / total) * 65535)
)
model["__names__"] = names model["__names__"] = names
model["__version__"] = 3 model["__version__"] = 3
model["__raw__"] = raw_model
return model return model

File diff suppressed because one or more lines are too long

View File

@ -4,28 +4,38 @@ import sys
import os import os
import json import json
def pack(directory, print_exceptions=True):
    """Build a raw model from a directory tree of text files.

    Each entry of ``directory`` is treated as a category, and every file
    inside that entry is read as one text belonging to that category.

    Args:
        directory: Path whose immediate children are the category folders.
        print_exceptions: When true, every exception encountered while
            listing or reading is also printed to stderr.

    Returns:
        A ``(raw_model, exceptions)`` tuple. ``raw_model`` is a list of
        ``{"category": ..., "text": ...}`` dicts; ``exceptions`` is a list
        of one-element tuples ``(exception,)``, one per failure.
    """
    texts = {}
    exceptions = []

    def record(error):
        # Failures are collected rather than raised so one unreadable entry
        # does not abort packing of everything else.
        exceptions.append((error,))
        if print_exceptions:
            print(error, file=sys.stderr)

    for path in os.listdir(directory):
        texts[path] = []
        # Resolve against the ``directory`` argument, not sys.argv[1], so the
        # function works when called with any path.
        category_dir = os.path.join(directory, path)
        try:
            for filename in os.listdir(category_dir):
                try:
                    with open(os.path.join(category_dir, filename)) as f:
                        texts[path].append(f.read())
                except Exception as e:
                    record(e)
        except Exception as e:
            # E.g. the entry is a plain file rather than a category folder.
            record(e)

    raw_model = []
    for category, cat_texts in texts.items():
        raw_model += [{"category": category, "text": i} for i in cat_texts]
    return raw_model, exceptions
if len(sys.argv) != 2: if len(sys.argv) != 2:
print("usage: pack.py <path>", file=sys.stderr) print("usage: pack.py <path>", file=sys.stderr)
exit(1) exit(1)
paths = os.listdir(sys.argv[1]) print(json.dumps(pack(sys.argv[1])[0]))
texts = {}
for path in paths:
texts[path] = []
try:
for file in os.listdir(os.path.join(sys.argv[1], path)):
try:
with open(os.path.join(sys.argv[1], path, file)) as f:
texts[path].append(f.read())
except Exception as e:
print(e, file=sys.stderr)
except Exception as e:
print(e, file=sys.stderr)
raw_model = []
for category, cat_texts in texts.items():
raw_model += [{"category": category, "text": i} for i in cat_texts]
print(json.dumps(raw_model))