Compare commits
7 Commits
75bae768b6 ... b133facd70
- b133facd70
- 4188045b75
- e06f2def24
- bebd286163
- 2d9f7cfc5a
- 5378be9418
- 4ddeefad07
README.md | 33
@@ -1,42 +1,54 @@
# GPTC

General-purpose text classifier in Python

GPTC provides both a CLI tool and a Python library.

## CLI Tool

### Classifying text

    python -m gptc <modelfile>
    python -m gptc classify <compiled model file>

This will prompt for a string and classify it, then print (in JSON) a dict of
the format `{category: probability, category:probability, ...}` to stdout.

Alternatively, if you only need the most likely category, you can use this:

    python -m gptc classify [-c|--category] <compiled model file>

This will prompt for a string and classify it, outputting the category on
stdout (or "None" if it cannot determine anything).

Alternatively, if you need confidence data, use:

    python -m gptc -j <modelfile>

This will print (in JSON) a dict of the format `{category: probability,
category:probability, ...}` to stdout.

### Compiling models

    python -m gptc <raw model file> -c|--compile <compiled model file>
    python -m gptc compile <raw model file>

This will print the compiled model in JSON to stdout.

## Library

### `gptc.Classifier(model)`

Create a `Classifier` object using the given *compiled* model (as a dict, not
JSON).

#### `Classifier.confidence(text)`

Classify `text`. Returns a dict of the format `{category: probability,
category:probability, ...}`

#### `Classifier.classify(text)`

Classify `text`. Returns the category into which the text is placed (as a
string), or `None` when it cannot classify the text.

## `gptc.compile(raw_model)`
### `gptc.compile(raw_model)`

Compile a raw model (as a list, not JSON) and return the compiled model (as a
dict).

## Model format

This section explains the raw model format, which is how you should create and
edit models.
@@ -55,6 +67,7 @@ in any way these Python objects can be. However, it is recommended to store
them in JSON format for compatibility with the command-line tool.

## Example model

An example model, which is designed to distinguish between texts written by
Mark Twain and those written by William Shakespeare, is available in `models`.
The raw model is in `models/raw.json`; the compiled model is in
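The hunk above stops at the hunk boundary. For the raw model format it refers to, the pack script at the end of this compare builds entries of the shape `{"category": ..., "text": ...}`; a sketch of what a raw model could contain, with hypothetical category names and placeholder texts:

```python
# Sketch of a raw model as a Python literal (store it as JSON for the CLI).
# The categories and texts below are placeholders, not the real example model.
raw_model = [
    {"category": "twain", "text": "Sample passage attributed to Mark Twain ..."},
    {"category": "shakespeare", "text": "Shall I compare thee to a summer's day? ..."},
]
```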
@@ -6,28 +6,44 @@ import json
import sys
import gptc

parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
parser.add_argument("model", help="model to use")
parser.add_argument(
    "-c", "--compile", help="compile raw model model to outfile", metavar="outfile"
parser = argparse.ArgumentParser(description="General Purpose Text Classifier", prog='gptc')
subparsers = parser.add_subparsers(dest="subparser_name", required=True)

compile_parser = subparsers.add_parser('compile', help='compile a raw model')
compile_parser.add_argument("model", help="raw model to compile")

classify_parser = subparsers.add_parser('classify', help='classify text')
classify_parser.add_argument("model", help="compiled model to use")
group = classify_parser.add_mutually_exclusive_group()
group.add_argument(
    "-j",
    "--json",
    help="output confidence dict as JSON (default)",
    action="store_true",
)
parser.add_argument(
    "-j", "--confidence", help="output confidence dict in json", action="store_true"
group.add_argument(
    "-c",
    "--category",
    help="output most likely category or `None`",
    action="store_true",
)

args = parser.parse_args()

with open(args.model, "r") as f:
    raw_model = json.load(f)

if args.compile:
    with open(args.compile, "w+") as f:
        json.dump(gptc.compile(raw_model), f)
    model = json.load(f)

if args.subparser_name == 'compile':
    print(json.dumps(gptc.compile(model)))
else:
    classifier = gptc.Classifier(raw_model)
    classifier = gptc.Classifier(model)

    if sys.stdin.isatty():
        text = input("Text to analyse: ")
    else:
        text = sys.stdin.read()
    if args.confidence:
        print(json.dumps(classifier.confidence(text)))
    else:

    if args.category:
        print(classifier.classify(text))
    else:
        print(json.dumps(classifier.confidence(text)))
@@ -20,26 +20,11 @@ class Classifier:
    """

    def __init__(self, model):
        try:
            model_version = model["__version__"]
        except:
            model_version = 1

        if model_version == 3:
            self.model = model
        else:
            # The model is an unsupported version
            try:
                raw_model = model["__raw__"]
            except:
                raise gptc.exceptions.UnsupportedModelError(
                    "this model is unsupported and does not contain a raw model for recompiling"
                )

            warnings.warn(
                "model needed to be recompiled on-the-fly; please re-compile it and use the new compiled model in the future"
        if model.get("__version__", 0) != 3:
            raise gptc.exceptions.UnsupportedModelError(
                f"unsupported model version"
            )
            self.model = gptc.compiler.compile(raw_model)
        self.model = model

    def confidence(self, text):
        """Classify text with confidence.
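With this change `Classifier.__init__` no longer recompiles old models on the fly: anything other than a version-3 compiled model raises `gptc.exceptions.UnsupportedModelError`. A hedged sketch of how calling code might react, loading the model from a hypothetical `model.json`:

```python
import json

import gptc
import gptc.exceptions  # module referenced in the hunk above; import path assumed

# "model.json" is a hypothetical path to a previously compiled model.
with open("model.json") as f:
    model = json.load(f)

try:
    classifier = gptc.Classifier(model)
except gptc.exceptions.UnsupportedModelError:
    # After this commit the only remedy is to recompile the raw model, e.g.
    # with `python -m gptc compile <raw model file>`, and load the new output.
    raise
```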
@@ -74,7 +59,8 @@ class Classifier:
            except KeyError:
                pass
        probs = {
            model["__names__"][category]: value for category, value in probs.items()
            model["__names__"][category]: value
            for category, value in probs.items()
        }
        total = sum(probs.values())
        probs = {category: value / total for category, value in probs.items()}
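The reformatted comprehension maps integer category indices back to the names stored under `__names__`, then normalises the scores so they sum to 1. A worked sketch with hypothetical scores:

```python
# Hypothetical raw scores keyed by category index, plus the model's name list.
model = {"__names__": ["category_a", "category_b"]}
probs = {0: 6.0, 1: 2.0}

# Replace indices with names, then normalise to probabilities.
probs = {
    model["__names__"][category]: value
    for category, value in probs.items()
}
total = sum(probs.values())
probs = {category: value / total for category, value in probs.items()}

print(probs)  # {'category_a': 0.75, 'category_b': 0.25}
```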
@@ -39,9 +39,13 @@ def compile(raw_model):
        categories_by_count[category] = {}
        for word in text:
            try:
                categories_by_count[category][word] += 1 / len(categories[category])
                categories_by_count[category][word] += 1 / len(
                    categories[category]
                )
            except KeyError:
                categories_by_count[category][word] = 1 / len(categories[category])
                categories_by_count[category][word] = 1 / len(
                    categories[category]
                )
    word_weights = {}
    for category, words in categories_by_count.items():
        for word, value in words.items():
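The expression being re-wrapped here adds `1 / len(categories[category])` for every occurrence, so a word's count ends up as the fraction of the category's tokens it accounts for. A small worked sketch with made-up tokens:

```python
# Hypothetical category text of five tokens; "the" occurs twice.
category_text = ["the", "cat", "sat", "the", "mat"]

count = 0.0
for word in category_text:
    if word == "the":
        count += 1 / len(category_text)

print(count)  # about 0.4: "the" accounts for 40% of this category's tokens
```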
@@ -55,11 +59,12 @@ def compile(raw_model):
        total = sum(weights.values())
        model[word] = []
        for category in names:
            model[word].append(round((weights.get(category, 0) / total) * 65535))
            model[word].append(
                round((weights.get(category, 0) / total) * 65535)
            )

    model["__names__"] = names

    model["__version__"] = 3
    model["__raw__"] = raw_model

    return model
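The append being re-wrapped here scales each word's per-category weight into an integer in the 0 to 65535 range, normalised so one word's entries sum to roughly 65535. A worked sketch with hypothetical names and weights:

```python
# Hypothetical category names and per-word weights.
names = ["category_a", "category_b"]
weights = {"category_a": 0.75, "category_b": 0.25}

# Normalise the weights and scale them into 16-bit integers, as in the hunk.
total = sum(weights.values())
row = [round((weights.get(category, 0) / total) * 65535) for category in names]

print(row)  # [49151, 16384]; the two entries sum to 65535 here
```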
File diff suppressed because one or more lines are too long
@@ -4,28 +4,38 @@ import sys
import os
import json


def pack(directory, print_exceptions=True):
    paths = os.listdir(directory)
    texts = {}
    exceptions = []

    for path in paths:
        texts[path] = []
        try:
            for file in os.listdir(os.path.join(sys.argv[1], path)):
                try:
                    with open(os.path.join(sys.argv[1], path, file)) as f:
                        texts[path].append(f.read())
                except Exception as e:
                    exceptions.append((e,))
                    if print_exceptions:
                        print(e, file=sys.stderr)
        except Exception as e:
            exceptions.append((e,))
            if print_exceptions:
                print(e, file=sys.stderr)

    raw_model = []

    for category, cat_texts in texts.items():
        raw_model += [{"category": category, "text": i} for i in cat_texts]

    return raw_model, exceptions


if len(sys.argv) != 2:
    print("usage: pack.py <path>", file=sys.stderr)
    exit(1)

paths = os.listdir(sys.argv[1])
texts = {}

for path in paths:
    texts[path] = []
    try:
        for file in os.listdir(os.path.join(sys.argv[1], path)):
            try:
                with open(os.path.join(sys.argv[1], path, file)) as f:
                    texts[path].append(f.read())
            except Exception as e:
                print(e, file=sys.stderr)
    except Exception as e:
        print(e, file=sys.stderr)

raw_model = []

for category, cat_texts in texts.items():
    raw_model += [{"category": category, "text": i} for i in cat_texts]

print(json.dumps(raw_model))
print(json.dumps(pack(sys.argv[1])[0]))
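The packing logic now lives in a `pack(directory, print_exceptions=True)` helper that returns `(raw_model, exceptions)` instead of printing as it goes. One caveat visible in the hunk: the inner `os.listdir`/`open` calls still join `sys.argv[1]` rather than `directory`, so the helper only behaves as expected when that argument matches. A sketch of reuse under that assumption, with hypothetical paths and an assumed module name:

```python
import json
import sys

# Assumed import; the file's actual path is not shown in this compare view.
from pack import pack

# The diffed pack() still reads sys.argv[1] for sub-paths, so keep it in sync
# with the directory argument ("texts" is a hypothetical per-category directory).
sys.argv = [sys.argv[0], "texts"]

raw_model, exceptions = pack("texts", print_exceptions=False)
print(f"packed {len(raw_model)} texts, {len(exceptions)} unreadable entries")

# Store the raw model as JSON so `python -m gptc compile` can use it.
with open("raw.json", "w") as f:
    json.dump(raw_model, f)
```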