Add ngrams
First git commit from new laptop!
This commit is contained in:
parent
c54c639b2f
commit
ce80647bbb
39
README.md
39
README.md
|
@ -8,31 +8,34 @@ GPTC provides both a CLI tool and a Python library.
|
||||||
|
|
||||||
### Classifying text
|
### Classifying text
|
||||||
|
|
||||||
python -m gptc classify <compiled model file>
|
python -m gptc classify [-n <max_ngram_length>] <compiled model file>
|
||||||
|
|
||||||
This will prompt for a string and classify it, then print (in JSON) a dict of
|
This will prompt for a string and classify it, then print (in JSON) a dict of
|
||||||
the format `{category: probability, category:probability, ...}` to stdout.
|
the format `{category: probability, category:probability, ...}` to stdout. (For
|
||||||
|
information about `-n <max_ngram_length>`, see section "Ngrams.")
|
||||||
|
|
||||||
Alternatively, if you only need the most likely category, you can use this:
|
Alternatively, if you only need the most likely category, you can use this:
|
||||||
|
|
||||||
python -m gptc classify [-c|--category] <compiled model file>
|
python -m gptc classify [-n <max_ngram_length>] <-c|--category> <compiled model file>
|
||||||
|
|
||||||
This will prompt for a string and classify it, outputting the category on
|
This will prompt for a string and classify it, outputting the category on
|
||||||
stdout (or "None" if it cannot determine anything).
|
stdout (or "None" if it cannot determine anything).
|
||||||
|
|
||||||
### Compiling models
|
### Compiling models
|
||||||
|
|
||||||
python -m gptc compile <raw model file>
|
python -m gptc compile [-n <max_ngram_length>] <raw model file>
|
||||||
|
|
||||||
This will print the compiled model in JSON to stdout.
|
This will print the compiled model in JSON to stdout.
|
||||||
|
|
||||||
## Library
|
## Library
|
||||||
|
|
||||||
### `gptc.Classifier(model)`
|
### `gptc.Classifier(model, max_ngram_length=1)`
|
||||||
|
|
||||||
Create a `Classifier` object using the given *compiled* model (as a dict, not
|
Create a `Classifier` object using the given *compiled* model (as a dict, not
|
||||||
JSON).
|
JSON).
|
||||||
|
|
||||||
|
For information about `max_ngram_length`, see section "Ngrams."
|
||||||
|
|
||||||
#### `Classifier.confidence(text)`
|
#### `Classifier.confidence(text)`
|
||||||
|
|
||||||
Classify `text`. Returns a dict of the format `{category: probability,
|
Classify `text`. Returns a dict of the format `{category: probability,
|
||||||
|
@ -43,10 +46,32 @@ category:probability, ...}`
|
||||||
Classify `text`. Returns the category into which the text is placed (as a
|
Classify `text`. Returns the category into which the text is placed (as a
|
||||||
string), or `None` when it cannot classify the text.
|
string), or `None` when it cannot classify the text.
|
||||||
|
|
||||||
### `gptc.compile(raw_model)`
|
### `gptc.compile(raw_model, max_ngram_length=1)`
|
||||||
Compile a raw model (as a list, not JSON) and return the compiled model (as a
|
Compile a raw model (as a list, not JSON) and return the compiled model (as a
|
||||||
dict).
|
dict).
|
||||||
|
|
||||||
|
For information about `max_ngram_length`, see section "Ngrams."
|
||||||
|
|
||||||
|
## Ngrams
|
||||||
|
|
||||||
|
GPTC optionally supports using ngrams to improve classification accuracy. They
|
||||||
|
are disabled by default (maximum length set to 1) for performance and
|
||||||
|
compatibility reasons. Enabling them significantly increases the time required
|
||||||
|
both for compilation and classification. The effect seems more significant for
|
||||||
|
compilation than for classification. Compiled models are also much larger when
|
||||||
|
ngrams are enabled. Larger maximum ngram lengths will result in slower
|
||||||
|
performance and larger files. It is a good idea to experiment with different
|
||||||
|
values and use the highest one at which GPTC is fast enough and models are
|
||||||
|
small enough for your needs.
|
||||||
|
|
||||||
|
Once a model is compiled at a certain maximum ngram length, it cannot be used
|
||||||
|
for classification with a higher value. If you instantiate a `Classifier` with
|
||||||
|
a `max_ngram_length` higher than the one the model was compiled with, the value
|
||||||
|
will be silently reduced to the one used when compiling the model.
|
||||||
|
|
||||||
|
Models compiled with older versions of GPTC which did not support ngrams are
|
||||||
|
handled the same way as models compiled with `max_ngram_length=1`.
|
||||||
|
|
||||||
## Model format
|
## Model format
|
||||||
|
|
||||||
This section explains the raw model format, which is how you should create and
|
This section explains the raw model format, which is how you should create and
|
||||||
|
@ -73,6 +98,8 @@ Mark Twain and those written by William Shakespeare, is available in `models`.
|
||||||
The raw model is in `models/raw.json`; the compiled model is in
|
The raw model is in `models/raw.json`; the compiled model is in
|
||||||
`models/compiled.json`.
|
`models/compiled.json`.
|
||||||
|
|
||||||
|
The example model was compiled with `max_ngram_length=10`.
|
||||||
|
|
||||||
## Benchmark
|
## Benchmark
|
||||||
|
|
||||||
A benchmark script is available for comparing performance of GPTC between
|
A benchmark script is available for comparing performance of GPTC between
|
||||||
|
|
|
@ -3,6 +3,7 @@ import gptc
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
max_ngram_length = 10
|
||||||
compile_iterations = 100
|
compile_iterations = 100
|
||||||
classify_iterations = 10000
|
classify_iterations = 10000
|
||||||
|
|
||||||
|
@ -12,9 +13,8 @@ with open("models/raw.json") as f:
|
||||||
with open("models/benchmark_text.txt") as f:
|
with open("models/benchmark_text.txt") as f:
|
||||||
text = f.read()
|
text = f.read()
|
||||||
|
|
||||||
classifier = gptc.Classifier(gptc.compile(raw_model))
|
|
||||||
|
|
||||||
print("Benchmarking GPTC on Python", sys.version)
|
print("Benchmarking GPTC on Python", sys.version)
|
||||||
|
print("Maximum ngram length:", max_ngram_length)
|
||||||
|
|
||||||
print(
|
print(
|
||||||
"Average compilation time over",
|
"Average compilation time over",
|
||||||
|
@ -23,7 +23,7 @@ print(
|
||||||
round(
|
round(
|
||||||
1000000
|
1000000
|
||||||
* timeit.timeit(
|
* timeit.timeit(
|
||||||
"gptc.compile(raw_model)",
|
"gptc.compile(raw_model, max_ngram_length)",
|
||||||
number=compile_iterations,
|
number=compile_iterations,
|
||||||
globals=globals(),
|
globals=globals(),
|
||||||
)
|
)
|
||||||
|
@ -33,6 +33,7 @@ print(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
classifier = gptc.Classifier(gptc.compile(raw_model, max_ngram_length), max_ngram_length)
|
||||||
print(
|
print(
|
||||||
"Average classification time over",
|
"Average classification time over",
|
||||||
classify_iterations,
|
classify_iterations,
|
||||||
|
@ -48,3 +49,4 @@ print(
|
||||||
),
|
),
|
||||||
"microseconds",
|
"microseconds",
|
||||||
)
|
)
|
||||||
|
print("--- benchmark complete ---")
|
||||||
|
|
|
@ -6,14 +6,18 @@ import json
|
||||||
import sys
|
import sys
|
||||||
import gptc
|
import gptc
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="General Purpose Text Classifier", prog='gptc')
|
parser = argparse.ArgumentParser(
|
||||||
|
description="General Purpose Text Classifier", prog="gptc"
|
||||||
|
)
|
||||||
subparsers = parser.add_subparsers(dest="subparser_name", required=True)
|
subparsers = parser.add_subparsers(dest="subparser_name", required=True)
|
||||||
|
|
||||||
compile_parser = subparsers.add_parser('compile', help='compile a raw model')
|
compile_parser = subparsers.add_parser("compile", help="compile a raw model")
|
||||||
compile_parser.add_argument("model", help="raw model to compile")
|
compile_parser.add_argument("model", help="raw model to compile")
|
||||||
|
compile_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
|
||||||
|
|
||||||
classify_parser = subparsers.add_parser('classify', help='classify text')
|
classify_parser = subparsers.add_parser("classify", help="classify text")
|
||||||
classify_parser.add_argument("model", help="compiled model to use")
|
classify_parser.add_argument("model", help="compiled model to use")
|
||||||
|
classify_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
|
||||||
group = classify_parser.add_mutually_exclusive_group()
|
group = classify_parser.add_mutually_exclusive_group()
|
||||||
group.add_argument(
|
group.add_argument(
|
||||||
"-j",
|
"-j",
|
||||||
|
@ -33,10 +37,10 @@ args = parser.parse_args()
|
||||||
with open(args.model, "r") as f:
|
with open(args.model, "r") as f:
|
||||||
model = json.load(f)
|
model = json.load(f)
|
||||||
|
|
||||||
if args.subparser_name == 'compile':
|
if args.subparser_name == "compile":
|
||||||
print(json.dumps(gptc.compile(model)))
|
print(json.dumps(gptc.compile(model, args.max_ngram_length)))
|
||||||
else:
|
else:
|
||||||
classifier = gptc.Classifier(model)
|
classifier = gptc.Classifier(model, args.max_ngram_length)
|
||||||
|
|
||||||
if sys.stdin.isatty():
|
if sys.stdin.isatty():
|
||||||
text = input("Text to analyse: ")
|
text = input("Text to analyse: ")
|
||||||
|
|
|
@ -12,6 +12,11 @@ class Classifier:
|
||||||
model : dict
|
model : dict
|
||||||
A compiled GPTC model.
|
A compiled GPTC model.
|
||||||
|
|
||||||
|
max_ngram_length : int
|
||||||
|
The maximum ngram length to use when tokenizing input. If this is
|
||||||
|
greater than the value used when the model was compiled, it will be
|
||||||
|
silently lowered to that value.
|
||||||
|
|
||||||
Attributes
|
Attributes
|
||||||
----------
|
----------
|
||||||
model : dict
|
model : dict
|
||||||
|
@ -19,12 +24,15 @@ class Classifier:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, model):
|
def __init__(self, model, max_ngram_length=1):
|
||||||
if model.get("__version__", 0) != 3:
|
if model.get("__version__", 0) != 3:
|
||||||
raise gptc.exceptions.UnsupportedModelError(
|
raise gptc.exceptions.UnsupportedModelError(
|
||||||
f"unsupported model version"
|
f"unsupported model version"
|
||||||
)
|
)
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self.max_ngram_length = min(
|
||||||
|
max_ngram_length, model.get("__ngrams__", 1)
|
||||||
|
)
|
||||||
|
|
||||||
def confidence(self, text):
|
def confidence(self, text):
|
||||||
"""Classify text with confidence.
|
"""Classify text with confidence.
|
||||||
|
@ -44,7 +52,7 @@ class Classifier:
|
||||||
|
|
||||||
model = self.model
|
model = self.model
|
||||||
|
|
||||||
text = gptc.tokenizer.tokenize(text)
|
text = gptc.tokenizer.tokenize(text, self.max_ngram_length)
|
||||||
probs = {}
|
probs = {}
|
||||||
for word in text:
|
for word in text:
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
import gptc.tokenizer
|
import gptc.tokenizer
|
||||||
|
|
||||||
|
|
||||||
def compile(raw_model):
|
def compile(raw_model, max_ngram_length=1):
|
||||||
"""Compile a raw model.
|
"""Compile a raw model.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
|
@ -11,6 +11,9 @@ def compile(raw_model):
|
||||||
raw_model : list of dict
|
raw_model : list of dict
|
||||||
A raw GPTC model.
|
A raw GPTC model.
|
||||||
|
|
||||||
|
max_ngram_length : int
|
||||||
|
Maximum ngram length to compile with.
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
dict
|
dict
|
||||||
|
@ -21,7 +24,7 @@ def compile(raw_model):
|
||||||
categories = {}
|
categories = {}
|
||||||
|
|
||||||
for portion in raw_model:
|
for portion in raw_model:
|
||||||
text = gptc.tokenizer.tokenize(portion["text"])
|
text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
|
||||||
category = portion["category"]
|
category = portion["category"]
|
||||||
try:
|
try:
|
||||||
categories[category] += text
|
categories[category] += text
|
||||||
|
@ -64,7 +67,7 @@ def compile(raw_model):
|
||||||
)
|
)
|
||||||
|
|
||||||
model["__names__"] = names
|
model["__names__"] = names
|
||||||
|
model["__ngrams__"] = max_ngram_length
|
||||||
model["__version__"] = 3
|
model["__version__"] = 3
|
||||||
|
|
||||||
return model
|
return model
|
||||||
|
|
|
@ -1,14 +1,23 @@
|
||||||
# SPDX-License-Identifier: LGPL-3.0-or-later
|
# SPDX-License-Identifier: LGPL-3.0-or-later
|
||||||
|
|
||||||
|
|
||||||
def tokenize(text):
|
def tokenize(text, max_ngram_length=1):
|
||||||
"""Convert a string to a list of lemmas."""
|
"""Convert a string to a list of lemmas."""
|
||||||
out = [""]
|
tokens = [""]
|
||||||
|
|
||||||
for char in text.lower():
|
for char in text.lower():
|
||||||
if char.isalpha() or char == "'":
|
if char.isalpha() or char == "'":
|
||||||
out[-1] += char
|
tokens[-1] += char
|
||||||
elif out[-1] != "":
|
elif tokens[-1] != "":
|
||||||
out.append("")
|
tokens.append("")
|
||||||
|
|
||||||
return [string for string in out if string]
|
tokens = [string for string in tokens if string]
|
||||||
|
|
||||||
|
if max_ngram_length == 1:
|
||||||
|
return tokens
|
||||||
|
else:
|
||||||
|
ngrams = []
|
||||||
|
for ngram_length in range(1, max_ngram_length + 1):
|
||||||
|
for index in range(len(tokens) + 1 - ngram_length):
|
||||||
|
ngrams.append(" ".join(tokens[index : index + ngram_length]))
|
||||||
|
return ngrams
|
||||||
|
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user