Compare commits: 75bae768b6 ... b133facd70 (7 commits)

Commits:
b133facd70
4188045b75
e06f2def24
bebd286163
2d9f7cfc5a
5378be9418
4ddeefad07
README.md (33 lines changed)

@@ -1,42 +1,54 @@
 # GPTC
 
 General-purpose text classifier in Python
 
 GPTC provides both a CLI tool and a Python library.
 
 ## CLI Tool
 
 ### Classifying text
 
-    python -m gptc <modelfile>
+    python -m gptc classify <compiled model file>
 
+This will prompt for a string and classify it, then print (in JSON) a dict of
+the format `{category: probability, category:probability, ...}` to stdout.
+
+Alternatively, if you only need the most likely category, you can use this:
+
+    python -m gptc classify [-c|--category] <compiled model file>
+
 This will prompt for a string and classify it, outputting the category on
 stdout (or "None" if it cannot determine anything).
 
-Alternatively, if you need confidence data, use:
-
-    python -m gptc -j <modelfile>
-
-This will print (in JSON) a dict of the format `{category: probability,
-category:probability, ...}` to stdout.
-
 ### Compiling models
 
-    python -m gptc <raw model file> -c|--compile <compiled model file>
+    python -m gptc compile <raw model file>
 
+This will print the compiled model in JSON to stdout.
+
 ## Library
 
 ### `gptc.Classifier(model)`
 
 Create a `Classifier` object using the given *compiled* model (as a dict, not
 JSON).
 
 #### `Classifier.confidence(text)`
 
 Classify `text`. Returns a dict of the format `{category: probability,
 category:probability, ...}`
 
 #### `Classifier.classify(text)`
 
 Classify `text`. Returns the category into which the text is placed (as a
 string), or `None` when it cannot classify the text.
 
-## `gptc.compile(raw_model)`
+### `gptc.compile(raw_model)`
 
 Compile a raw model (as a list, not JSON) and return the compiled model (as a
 dict).
 
 ## Model format
 
 This section explains the raw model format, which is how you should create and
 edit models.

@@ -55,6 +67,7 @@ in any way these Python objects can be. However, it is recommended to store
 them in JSON format for compatibility with the command-line tool.
 
 ## Example model
 
 An example model, which is designed to distinguish between texts written by
 Mark Twain and those written by William Shakespeare, is available in `models`.
 The raw model is in `models/raw.json`; the compiled model is in
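Since the README now documents the library API directly, here is a minimal sketch of that workflow using the example model it mentions (`models/raw.json`); the sample text is made up:

    import json
    import gptc

    # Load the raw example model shipped in `models` and compile it to a dict.
    with open("models/raw.json") as f:
        raw_model = json.load(f)
    compiled = gptc.compile(raw_model)

    classifier = gptc.Classifier(compiled)  # takes the *compiled* model (a dict, not JSON)
    text = "The steamboat drifted slowly down the Mississippi."
    print(classifier.confidence(text))      # {category: probability, ...}
    print(classifier.classify(text))        # most likely category, or None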
@@ -6,28 +6,44 @@ import json
 import sys
 import gptc
 
-parser = argparse.ArgumentParser(description="General Purpose Text Classifier")
-parser.add_argument("model", help="model to use")
-parser.add_argument(
-    "-c", "--compile", help="compile raw model model to outfile", metavar="outfile"
+parser = argparse.ArgumentParser(description="General Purpose Text Classifier", prog='gptc')
+subparsers = parser.add_subparsers(dest="subparser_name", required=True)
+
+compile_parser = subparsers.add_parser('compile', help='compile a raw model')
+compile_parser.add_argument("model", help="raw model to compile")
+
+classify_parser = subparsers.add_parser('classify', help='classify text')
+classify_parser.add_argument("model", help="compiled model to use")
+group = classify_parser.add_mutually_exclusive_group()
+group.add_argument(
+    "-j",
+    "--json",
+    help="output confidence dict as JSON (default)",
+    action="store_true",
 )
-parser.add_argument(
-    "-j", "--confidence", help="output confidence dict in json", action="store_true"
+group.add_argument(
+    "-c",
+    "--category",
+    help="output most likely category or `None`",
+    action="store_true",
 )
 
 args = parser.parse_args()
 
 with open(args.model, "r") as f:
-    raw_model = json.load(f)
+    model = json.load(f)
 
-if args.compile:
-    with open(args.compile, "w+") as f:
-        json.dump(gptc.compile(raw_model), f)
+if args.subparser_name == 'compile':
+    print(json.dumps(gptc.compile(model)))
 else:
-    classifier = gptc.Classifier(raw_model)
+    classifier = gptc.Classifier(model)
 
     if sys.stdin.isatty():
         text = input("Text to analyse: ")
     else:
         text = sys.stdin.read()
-    if args.confidence:
-        print(json.dumps(classifier.confidence(text)))
-    else:
+    if args.category:
         print(classifier.classify(text))
+    else:
+        print(json.dumps(classifier.confidence(text)))
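The reworked entry point reads from stdin when it is not a TTY, so the new subcommands can also be driven from another script. A rough sketch; the compiled model path is hypothetical, and could be produced with `python -m gptc compile <raw model file>`:

    import json
    import subprocess

    model_path = "compiled.json"  # hypothetical output of the compile subcommand

    # Default classify output is the JSON confidence dict.
    result = subprocess.run(
        ["python", "-m", "gptc", "classify", model_path],
        input="What news on the Rialto?",
        capture_output=True,
        text=True,
        check=True,
    )
    print(json.loads(result.stdout))

    # With -c/--category only the most likely category (or "None") is printed.
    result = subprocess.run(
        ["python", "-m", "gptc", "classify", "--category", model_path],
        input="What news on the Rialto?",
        capture_output=True,
        text=True,
        check=True,
    )
    print(result.stdout.strip())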
@@ -20,26 +20,11 @@ class Classifier:
     """
 
     def __init__(self, model):
-        try:
-            model_version = model["__version__"]
-        except:
-            model_version = 1
-
-        if model_version == 3:
-            self.model = model
-        else:
-            # The model is an unsupported version
-            try:
-                raw_model = model["__raw__"]
-            except:
-                raise gptc.exceptions.UnsupportedModelError(
-                    "this model is unsupported and does not contain a raw model for recompiling"
-                )
-            warnings.warn(
-                "model needed to be recompiled on-the-fly; please re-compile it and use the new compiled model in the future"
-            )
-            self.model = gptc.compiler.compile(raw_model)
+        if model.get("__version__", 0) != 3:
+            raise gptc.exceptions.UnsupportedModelError(
+                f"unsupported model version"
+            )
+        self.model = model
 
     def confidence(self, text):
         """Classify text with confidence.

@@ -74,7 +59,8 @@ class Classifier:
             except KeyError:
                 pass
         probs = {
-            model["__names__"][category]: value for category, value in probs.items()
+            model["__names__"][category]: value
+            for category, value in probs.items()
         }
         total = sum(probs.values())
         probs = {category: value / total for category, value in probs.items()}
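One behavioural consequence of the simplified `__init__`: models without `"__version__": 3` are now rejected outright instead of being recompiled on the fly. A small sketch, assuming `gptc.exceptions` is importable and using a made-up raw model:

    import gptc
    import gptc.exceptions

    # compile() stamps the result with "__version__": 3, so this is accepted.
    model = gptc.compile(
        [{"category": "a", "text": "alpha"}, {"category": "b", "text": "beta"}]
    )
    classifier = gptc.Classifier(model)

    # A model lacking the version marker now raises instead of recompiling.
    try:
        gptc.Classifier({"some_word": [65535, 0]})
    except gptc.exceptions.UnsupportedModelError as exc:
        print(exc)  # unsupported model version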
@@ -39,9 +39,13 @@ def compile(raw_model):
         categories_by_count[category] = {}
         for word in text:
             try:
-                categories_by_count[category][word] += 1 / len(categories[category])
+                categories_by_count[category][word] += 1 / len(
+                    categories[category]
+                )
             except KeyError:
-                categories_by_count[category][word] = 1 / len(categories[category])
+                categories_by_count[category][word] = 1 / len(
+                    categories[category]
+                )
     word_weights = {}
     for category, words in categories_by_count.items():
         for word, value in words.items():

@@ -55,11 +59,12 @@ def compile(raw_model):
         total = sum(weights.values())
         model[word] = []
         for category in names:
-            model[word].append(round((weights.get(category, 0) / total) * 65535))
+            model[word].append(
+                round((weights.get(category, 0) / total) * 65535)
+            )
 
     model["__names__"] = names
 
     model["__version__"] = 3
-    model["__raw__"] = raw_model
 
     return model
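The compiler changes are mostly line wrapping, but one line is genuinely removed: the compiled model no longer embeds a `__raw__` copy of its source, which matches the classifier no longer recompiling old models. A quick sketch of what a compiled model now contains (the raw model entries are made up):

    import gptc

    model = gptc.compile([
        {"category": "twain", "text": "riverboat on the mississippi"},
        {"category": "shakespeare", "text": "a rose by any other name"},
    ])

    print(model["__version__"])  # 3
    print(model["__names__"])    # the category names used to label confidence output
    print("__raw__" in model)    # False -- the raw source is no longer stored
    # Every other key is a word mapped to per-category weights scaled to 0..65535.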
File diff suppressed because one or more lines are too long
@@ -4,14 +4,13 @@ import sys
 import os
 import json
 
-if len(sys.argv) != 2:
-    print("usage: pack.py <path>", file=sys.stderr)
-    exit(1)
-
-paths = os.listdir(sys.argv[1])
-texts = {}
+def pack(directory, print_exceptions=True):
+    paths = os.listdir(directory)
+    texts = {}
+    exceptions = []
 
     for path in paths:
         texts[path] = []
         try:
             for file in os.listdir(os.path.join(sys.argv[1], path)):

@@ -19,13 +18,24 @@ for path in paths:
                 with open(os.path.join(sys.argv[1], path, file)) as f:
                     texts[path].append(f.read())
             except Exception as e:
+                exceptions.append((e,))
+                if print_exceptions:
                     print(e, file=sys.stderr)
         except Exception as e:
+            exceptions.append((e,))
+            if print_exceptions:
                 print(e, file=sys.stderr)
 
     raw_model = []
 
     for category, cat_texts in texts.items():
         raw_model += [{"category": category, "text": i} for i in cat_texts]
 
-print(json.dumps(raw_model))
+    return raw_model, exceptions
+
+
+if len(sys.argv) != 2:
+    print("usage: pack.py <path>", file=sys.stderr)
+    exit(1)
+
+print(json.dumps(pack(sys.argv[1])[0]))
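With this change `pack.py` exposes a `pack()` function that returns the raw model together with any exceptions it collected, rather than printing and exiting. A sketch of calling it from another script; it assumes the command-line lines at the bottom of `pack.py` were put behind an `if __name__ == "__main__":` guard (they are not in this diff), since importing the module as-is would also run them:

    import json
    import sys

    from pack import pack

    # pack() returns (raw_model, exceptions); the caller decides what to do with each.
    raw_model, exceptions = pack(sys.argv[1])

    for (exc,) in exceptions:  # errors are collected as one-element tuples
        print("could not read:", exc, file=sys.stderr)

    print(json.dumps(raw_model))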