Add ngrams

First git commit from new laptop!
This commit is contained in:
Samuel Sloniker 2022-07-13 11:45:17 -07:00
parent c54c639b2f
commit ce80647bbb
7 changed files with 80 additions and 27 deletions

View File

@ -8,31 +8,34 @@ GPTC provides both a CLI tool and a Python library.
### Classifying text
python -m gptc classify <compiled model file>
python -m gptc classify [-n <max_ngram_length>] <compiled model file>
This will prompt for a string and classify it, then print (in JSON) a dict of
the format `{category: probability, category:probability, ...}` to stdout.
the format `{category: probability, category:probability, ...}` to stdout. (For
information about `-n <max_ngram_length>`, see section "Ngrams.")
Alternatively, if you only need the most likely category, you can use this:
python -m gptc classify [-c|--category] <compiled model file>
python -m gptc classify [-n <max_ngram_length>] <-c|--category> <compiled model file>
This will prompt for a string and classify it, outputting the category on
stdout (or "None" if it cannot determine anything).
### Compiling models
python -m gptc compile <raw model file>
python -m gptc compile [-n <max_ngram_length>] <raw model file>
This will print the compiled model in JSON to stdout.
## Library
### `gptc.Classifier(model)`
### `gptc.Classifier(model, max_ngram_length=1)`
Create a `Classifier` object using the given *compiled* model (as a dict, not
JSON).
For information about `max_ngram_length`, see section "Ngrams."
#### `Classifier.confidence(text)`
Classify `text`. Returns a dict of the format `{category: probability,
@ -43,10 +46,32 @@ category:probability, ...}`
Classify `text`. Returns the category into which the text is placed (as a
string), or `None` when it cannot classify the text.
### `gptc.compile(raw_model)`
### `gptc.compile(raw_model, max_ngram_length=1)`
Compile a raw model (as a list, not JSON) and return the compiled model (as a
dict).
For information about `max_ngram_length`, see section "Ngrams."
## Ngrams
GPTC optionally supports using ngrams to improve classification accuracy. They
are disabled by default (maximum length set to 1) for performance and
compatibility reasons. Enabling them significantly increases the time required
both for compilation and classification. The effect seems more significant for
compilation than for classification. Compiled models are also much larger when
ngrams are enabled. Larger maximum ngram lengths will result in slower
performance and larger files. It is a good idea to experiment with different
values and use the highest one at which GPTC is fast enough and models are
small enough for your needs.
Once a model is compiled at a certain maximum ngram length, it cannot be used
for classification with a higher value. If you instantiate a `Classifier` with
a model compiled with a lower `max_ngram_length`, the value will be silently
reduced to the one used when compiling the model.
Models compiled with older versions of GPTC which did not support ngrams are
handled the same way as models compiled with `max_ngram_length=1`.
## Model format
This section explains the raw model format, which is how you should create and
@ -73,6 +98,8 @@ Mark Twain and those written by William Shakespeare, is available in `models`.
The raw model is in `models/raw.json`; the compiled model is in
`models/compiled.json`.
The example model was compiled with `max_ngram_length=10`.
## Benchmark
A benchmark script is available for comparing performance of GPTC between

View File

@ -3,6 +3,7 @@ import gptc
import json
import sys
max_ngram_length = 10
compile_iterations = 100
classify_iterations = 10000
@ -12,9 +13,8 @@ with open("models/raw.json") as f:
with open("models/benchmark_text.txt") as f:
text = f.read()
classifier = gptc.Classifier(gptc.compile(raw_model))
print("Benchmarking GPTC on Python", sys.version)
print("Maximum ngram length:", max_ngram_length)
print(
"Average compilation time over",
@ -23,7 +23,7 @@ print(
round(
1000000
* timeit.timeit(
"gptc.compile(raw_model)",
"gptc.compile(raw_model, max_ngram_length)",
number=compile_iterations,
globals=globals(),
)
@ -33,6 +33,7 @@ print(
)
classifier = gptc.Classifier(gptc.compile(raw_model, max_ngram_length), max_ngram_length)
print(
"Average classification time over",
classify_iterations,
@ -48,3 +49,4 @@ print(
),
"microseconds",
)
print("--- benchmark complete ---")

View File

@ -6,14 +6,18 @@ import json
import sys
import gptc
parser = argparse.ArgumentParser(description="General Purpose Text Classifier", prog='gptc')
parser = argparse.ArgumentParser(
description="General Purpose Text Classifier", prog="gptc"
)
subparsers = parser.add_subparsers(dest="subparser_name", required=True)
compile_parser = subparsers.add_parser('compile', help='compile a raw model')
compile_parser = subparsers.add_parser("compile", help="compile a raw model")
compile_parser.add_argument("model", help="raw model to compile")
compile_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
classify_parser = subparsers.add_parser('classify', help='classify text')
classify_parser = subparsers.add_parser("classify", help="classify text")
classify_parser.add_argument("model", help="compiled model to use")
classify_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
group = classify_parser.add_mutually_exclusive_group()
group.add_argument(
"-j",
@ -33,10 +37,10 @@ args = parser.parse_args()
with open(args.model, "r") as f:
model = json.load(f)
if args.subparser_name == 'compile':
print(json.dumps(gptc.compile(model)))
if args.subparser_name == "compile":
print(json.dumps(gptc.compile(model, args.max_ngram_length)))
else:
classifier = gptc.Classifier(model)
classifier = gptc.Classifier(model, args.max_ngram_length)
if sys.stdin.isatty():
text = input("Text to analyse: ")

View File

@ -12,6 +12,11 @@ class Classifier:
model : dict
A compiled GPTC model.
max_ngram_length : int
The maximum ngram length to use when tokenizing input. If this is
greater than the value used when the model was compiled, it will be
silently lowered to that value.
Attributes
----------
model : dict
@ -19,12 +24,15 @@ class Classifier:
"""
def __init__(self, model):
def __init__(self, model, max_ngram_length=1):
if model.get("__version__", 0) != 3:
raise gptc.exceptions.UnsupportedModelError(
f"unsupported model version"
)
self.model = model
self.max_ngram_length = min(
max_ngram_length, model.get("__ngrams__", 1)
)
def confidence(self, text):
"""Classify text with confidence.
@ -44,7 +52,7 @@ class Classifier:
model = self.model
text = gptc.tokenizer.tokenize(text)
text = gptc.tokenizer.tokenize(text, self.max_ngram_length)
probs = {}
for word in text:
try:

View File

@ -3,7 +3,7 @@
import gptc.tokenizer
def compile(raw_model):
def compile(raw_model, max_ngram_length=1):
"""Compile a raw model.
Parameters
@ -11,6 +11,9 @@ def compile(raw_model):
raw_model : list of dict
A raw GPTC model.
max_ngram_length : int
    Maximum ngram length to compile with.
Returns
-------
dict
@ -21,7 +24,7 @@ def compile(raw_model):
categories = {}
for portion in raw_model:
text = gptc.tokenizer.tokenize(portion["text"])
text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
category = portion["category"]
try:
categories[category] += text
@ -64,7 +67,7 @@ def compile(raw_model):
)
model["__names__"] = names
model["__ngrams__"] = max_ngram_length
model["__version__"] = 3
return model

View File

@ -1,14 +1,23 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
def tokenize(text):
def tokenize(text, max_ngram_length=1):
"""Convert a string to a list of lemmas."""
out = [""]
tokens = [""]
for char in text.lower():
if char.isalpha() or char == "'":
out[-1] += char
elif out[-1] != "":
out.append("")
tokens[-1] += char
elif tokens[-1] != "":
tokens.append("")
return [string for string in out if string]
tokens = [string for string in tokens if string]
if max_ngram_length == 1:
return tokens
else:
ngrams = []
for ngram_length in range(1, max_ngram_length + 1):
for index in range(len(tokens) + 1 - ngram_length):
ngrams.append(" ".join(tokens[index : index + ngram_length]))
return ngrams

File diff suppressed because one or more lines are too long