Compare commits

..

No commits in common. "b133facd70f97312c92978fa92bce393c7b172a4" and "75bae768b642de122280b1583a901000d7dd0ce1" have entirely different histories.

6 changed files with 70 additions and 100 deletions

View File

@ -1,54 +1,42 @@
# GPTC # GPTC
General-purpose text classifier in Python General-purpose text classifier in Python
GPTC provides both a CLI tool and a Python library. GPTC provides both a CLI tool and a Python library.
## CLI Tool ## CLI Tool
### Classifying text ### Classifying text
python -m gptc classify <compiled model file> python -m gptc <modelfile>
This will prompt for a string and classify it, then print (in JSON) a dict of
the format `{category: probability, category: probability, ...}` to stdout.
Alternatively, if you only need the most likely category, you can use this:
python -m gptc classify [-c|--category] <compiled model file>
This will prompt for a string and classify it, outputting the category on This will prompt for a string and classify it, outputting the category on
stdout (or "None" if it cannot determine anything). stdout (or "None" if it cannot determine anything).
Alternatively, if you need confidence data, use:
python -m gptc -j <modelfile>
This will print (in JSON) a dict of the format `{category: probability,
category: probability, ...}` to stdout.
### Compiling models ### Compiling models
python -m gptc compile <raw model file> python -m gptc <raw model file> -c|--compile <compiled model file>
This will print the compiled model in JSON to stdout.
## Library ## Library
### `gptc.Classifier(model)` ### `gptc.Classifier(model)`
Create a `Classifier` object using the given *compiled* model (as a dict, not Create a `Classifier` object using the given *compiled* model (as a dict, not
JSON). JSON).
#### `Classifier.confidence(text)` #### `Classifier.confidence(text)`
Classify `text`. Returns a dict of the format `{category: probability, Classify `text`. Returns a dict of the format `{category: probability,
category:probability, ...}` category:probability, ...}`
#### `Classifier.classify(text)` #### `Classifier.classify(text)`
Classify `text`. Returns the category into which the text is placed (as a Classify `text`. Returns the category into which the text is placed (as a
string), or `None` when it cannot classify the text. string), or `None` when it cannot classify the text.
## `gptc.compile(raw_model)`
### `gptc.compile(raw_model)`
Compile a raw model (as a list, not JSON) and return the compiled model (as a Compile a raw model (as a list, not JSON) and return the compiled model (as a
dict). dict).
## Model format ## Model format
This section explains the raw model format, which is how you should create and This section explains the raw model format, which is how you should create and
edit models. edit models.
@ -67,7 +55,6 @@ in any way these Python objects can be. However, it is recommended to store
them in JSON format for compatibility with the command-line tool. them in JSON format for compatibility with the command-line tool.
## Example model ## Example model
An example model, which is designed to distinguish between texts written by An example model, which is designed to distinguish between texts written by
Mark Twain and those written by William Shakespeare, is available in `models`. Mark Twain and those written by William Shakespeare, is available in `models`.
The raw model is in `models/raw.json`; the compiled model is in The raw model is in `models/raw.json`; the compiled model is in

View File

@ -6,44 +6,28 @@ import json
import sys import sys
import gptc import gptc
parser = argparse.ArgumentParser(description="General Purpose Text Classifier", prog='gptc') parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
subparsers = parser.add_subparsers(dest="subparser_name", required=True) parser.add_argument("model", help="model to use")
parser.add_argument(
compile_parser = subparsers.add_parser('compile', help='compile a raw model') "-c", "--compile", help="compile raw model model to outfile", metavar="outfile"
compile_parser.add_argument("model", help="raw model to compile")
classify_parser = subparsers.add_parser('classify', help='classify text')
classify_parser.add_argument("model", help="compiled model to use")
group = classify_parser.add_mutually_exclusive_group()
group.add_argument(
"-j",
"--json",
help="output confidence dict as JSON (default)",
action="store_true",
) )
group.add_argument( parser.add_argument(
"-c", "-j", "--confidence", help="output confidence dict in json", action="store_true"
"--category",
help="output most likely category or `None`",
action="store_true",
) )
args = parser.parse_args() args = parser.parse_args()
with open(args.model, "r") as f: with open(args.model, "r") as f:
model = json.load(f) raw_model = json.load(f)
if args.compile:
if args.subparser_name == 'compile': with open(args.compile, "w+") as f:
print(json.dumps(gptc.compile(model))) json.dump(gptc.compile(raw_model), f)
else: else:
classifier = gptc.Classifier(model) classifier = gptc.Classifier(raw_model)
if sys.stdin.isatty(): if sys.stdin.isatty():
text = input("Text to analyse: ") text = input("Text to analyse: ")
else: else:
text = sys.stdin.read() text = sys.stdin.read()
if args.confidence:
if args.category:
print(classifier.classify(text))
else:
print(json.dumps(classifier.confidence(text))) print(json.dumps(classifier.confidence(text)))
else:
print(classifier.classify(text))

View File

@ -20,11 +20,26 @@ class Classifier:
""" """
def __init__(self, model): def __init__(self, model):
if model.get("__version__", 0) != 3: try:
raise gptc.exceptions.UnsupportedModelError( model_version = model["__version__"]
f"unsupported model version" except:
model_version = 1
if model_version == 3:
self.model = model
else:
# The model is an unsupported version
try:
raw_model = model["__raw__"]
except:
raise gptc.exceptions.UnsupportedModelError(
"this model is unsupported and does not contain a raw model for recompiling"
)
warnings.warn(
"model needed to be recompiled on-the-fly; please re-compile it and use the new compiled model in the future"
) )
self.model = model self.model = gptc.compiler.compile(raw_model)
def confidence(self, text): def confidence(self, text):
"""Classify text with confidence. """Classify text with confidence.
@ -59,8 +74,7 @@ class Classifier:
except KeyError: except KeyError:
pass pass
probs = { probs = {
model["__names__"][category]: value model["__names__"][category]: value for category, value in probs.items()
for category, value in probs.items()
} }
total = sum(probs.values()) total = sum(probs.values())
probs = {category: value / total for category, value in probs.items()} probs = {category: value / total for category, value in probs.items()}

View File

@ -39,13 +39,9 @@ def compile(raw_model):
categories_by_count[category] = {} categories_by_count[category] = {}
for word in text: for word in text:
try: try:
categories_by_count[category][word] += 1 / len( categories_by_count[category][word] += 1 / len(categories[category])
categories[category]
)
except KeyError: except KeyError:
categories_by_count[category][word] = 1 / len( categories_by_count[category][word] = 1 / len(categories[category])
categories[category]
)
word_weights = {} word_weights = {}
for category, words in categories_by_count.items(): for category, words in categories_by_count.items():
for word, value in words.items(): for word, value in words.items():
@ -59,12 +55,11 @@ def compile(raw_model):
total = sum(weights.values()) total = sum(weights.values())
model[word] = [] model[word] = []
for category in names: for category in names:
model[word].append( model[word].append(round((weights.get(category, 0) / total) * 65535))
round((weights.get(category, 0) / total) * 65535)
)
model["__names__"] = names model["__names__"] = names
model["__version__"] = 3 model["__version__"] = 3
model["__raw__"] = raw_model
return model return model

File diff suppressed because one or more lines are too long

View File

@ -4,38 +4,28 @@ import sys
import os import os
import json import json
def pack(directory, print_exceptions=True):
    """Build a raw model from a directory tree of text files.

    Every entry of ``directory`` is treated as a category whose name is the
    entry name; every file inside that entry contributes one text to the
    category.

    Args:
        directory: Path of the directory whose entries are the categories.
        print_exceptions: When true, each exception caught while listing or
            reading is also printed to stderr.

    Returns:
        A ``(raw_model, exceptions)`` tuple. ``raw_model`` is a list of
        ``{"category": ..., "text": ...}`` dicts; ``exceptions`` is a list of
        1-tuples, each holding one exception that was caught.
    """
    texts = {}
    exceptions = []

    def record(error):
        # Errors are collected rather than raised so that one unreadable
        # entry does not abort packing of the remaining ones.
        exceptions.append((error,))
        if print_exceptions:
            print(error, file=sys.stderr)

    for path in os.listdir(directory):
        texts[path] = []
        try:
            # Resolve entries relative to `directory`, not sys.argv[1], so the
            # function works for any caller-supplied path.
            for file in os.listdir(os.path.join(directory, path)):
                try:
                    with open(os.path.join(directory, path, file)) as f:
                        texts[path].append(f.read())
                except Exception as e:
                    record(e)
        except Exception as e:
            # Typically the entry is not a directory or cannot be listed.
            record(e)

    raw_model = []
    for category, cat_texts in texts.items():
        raw_model += [{"category": category, "text": i} for i in cat_texts]
    return raw_model, exceptions
if len(sys.argv) != 2: if len(sys.argv) != 2:
print("usage: pack.py <path>", file=sys.stderr) print("usage: pack.py <path>", file=sys.stderr)
exit(1) exit(1)
print(json.dumps(pack(sys.argv[1])[0])) paths = os.listdir(sys.argv[1])
# Collect the contents of every readable file, grouped under the name of the
# entry of sys.argv[1] it was found in. Failures are reported on stderr and
# otherwise ignored, so one bad entry does not stop the run.
texts = {}
for category_dir in paths:
    collected = texts[category_dir] = []
    try:
        entries = os.listdir(os.path.join(sys.argv[1], category_dir))
        for entry in entries:
            try:
                with open(os.path.join(sys.argv[1], category_dir, entry)) as handle:
                    collected.append(handle.read())
            except Exception as error:
                print(error, file=sys.stderr)
    except Exception as error:
        print(error, file=sys.stderr)

# Flatten the grouping into the raw model: one record per text, in the order
# the categories and their texts were collected.
raw_model = [
    {"category": category, "text": text}
    for category, cat_texts in texts.items()
    for text in cat_texts
]
print(json.dumps(raw_model))