Add ngrams
First git commit from new laptop!
parent c54c639b2f
commit ce80647bbb

README.md | 39

@@ -8,31 +8,34 @@ GPTC provides both a CLI tool and a Python library.

 ### Classifying text

-    python -m gptc classify <compiled model file>
+    python -m gptc classify [-n <max_ngram_length>] <compiled model file>

 This will prompt for a string and classify it, then print (in JSON) a dict of
-the format `{category: probability, category:probability, ...}` to stdout.
+the format `{category: probability, category:probability, ...}` to stdout. (For
+information about `-n <max_ngram_length>`, see section "Ngrams.")

 Alternatively, if you only need the most likely category, you can use this:

-    python -m gptc classify [-c|--category] <compiled model file>
+    python -m gptc classify [-n <max_ngram_length>] <-c|--category> <compiled model file>

 This will prompt for a string and classify it, outputting the category on
 stdout (or "None" if it cannot determine anything).

 ### Compiling models

-    python -m gptc compile <raw model file>
+    python -m gptc compile [-n <max_ngram_length>] <raw model file>

 This will print the compiled model in JSON to stdout.

 ## Library

-### `gptc.Classifier(model)`
+### `gptc.Classifier(model, max_ngram_length=1)`

 Create a `Classifier` object using the given *compiled* model (as a dict, not
 JSON).

+For information about `max_ngram_length`, see section "Ngrams."
+
 #### `Classifier.confidence(text)`

 Classify `text`. Returns a dict of the format `{category: probability,
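As a quick illustration of the updated library API (the input string and the
printed category names are only illustrative):

```python
import json

import gptc

# The library takes the *compiled* model as a dict, not as JSON text;
# the example model shipped in the `models` directory works here.
with open("models/compiled.json") as f:
    model = json.load(f)

classifier = gptc.Classifier(model, max_ngram_length=5)

# Prints a dict of the form {category: probability, ...}
print(classifier.confidence("The report of my death was an exaggeration."))
```
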
@@ -43,10 +46,32 @@ category:probability, ...}`

 Classify `text`. Returns the category into which the text is placed (as a
 string), or `None` when it cannot classify the text.

-### `gptc.compile(raw_model)`
+### `gptc.compile(raw_model, max_ngram_length=1)`

 Compile a raw model (as a list, not JSON) and return the compiled model (as a
 dict).

+For information about `max_ngram_length`, see section "Ngrams."
+
+## Ngrams
+
+GPTC optionally supports using ngrams to improve classification accuracy. They
+are disabled by default (maximum length set to 1) for performance and
+compatibility reasons. Enabling them significantly increases the time required
+both for compilation and classification; the effect seems more pronounced for
+compilation than for classification. Compiled models are also much larger when
+ngrams are enabled, and larger maximum ngram lengths mean slower performance
+and larger files. It is a good idea to experiment with different values and
+use the highest one at which GPTC is fast enough and models are small enough
+for your needs.
+
+Once a model is compiled at a certain maximum ngram length, it cannot be used
+for classification with a higher value. If you instantiate a `Classifier` with
+a model compiled with a lower `max_ngram_length`, the value will be silently
+reduced to the one used when compiling the model.
+
+Models compiled with older versions of GPTC which did not support ngrams are
+handled the same way as models compiled with `max_ngram_length=1`.
+
 ## Model format

 This section explains the raw model format, which is how you should create and
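The clamping rule described under "Ngrams" can be seen directly; a small
sketch (the raw model contents are made up for the example):

```python
import gptc

raw_model = [
    {"text": "hello world hello there", "category": "greeting"},
    {"text": "goodbye cruel world", "category": "farewell"},
]

# Compile with bigrams at most...
model = gptc.compile(raw_model, max_ngram_length=2)

# ...then ask for 5-grams: the classifier silently clamps to the compiled value.
classifier = gptc.Classifier(model, max_ngram_length=5)
print(classifier.max_ngram_length)  # 2
```
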
@@ -73,6 +98,8 @@ Mark Twain and those written by William Shakespeare, is available in `models`.
 The raw model is in `models/raw.json`; the compiled model is in
 `models/compiled.json`.

+The example model was compiled with `max_ngram_length=10`.
+
 ## Benchmark

 A benchmark script is available for comparing performance of GPTC between

@@ -3,6 +3,7 @@ import gptc
 import json
 import sys

+max_ngram_length = 10
 compile_iterations = 100
 classify_iterations = 10000

@@ -12,9 +13,8 @@ with open("models/raw.json") as f:
 with open("models/benchmark_text.txt") as f:
     text = f.read()

-classifier = gptc.Classifier(gptc.compile(raw_model))
-
 print("Benchmarking GPTC on Python", sys.version)
+print("Maximum ngram length:", max_ngram_length)

 print(
     "Average compilation time over",

@@ -23,7 +23,7 @@ print(
     round(
         1000000
         * timeit.timeit(
-            "gptc.compile(raw_model)",
+            "gptc.compile(raw_model, max_ngram_length)",
             number=compile_iterations,
             globals=globals(),
         )

@@ -33,6 +33,7 @@ print(
 )


+classifier = gptc.Classifier(gptc.compile(raw_model, max_ngram_length), max_ngram_length)
 print(
     "Average classification time over",
     classify_iterations,

@@ -48,3 +49,4 @@ print(
     ),
     "microseconds",
 )
+print("--- benchmark complete ---")

@@ -6,14 +6,18 @@ import json
 import sys
 import gptc

-parser = argparse.ArgumentParser(description="General Purpose Text Classifier", prog='gptc')
+parser = argparse.ArgumentParser(
+    description="General Purpose Text Classifier", prog="gptc"
+)
 subparsers = parser.add_subparsers(dest="subparser_name", required=True)

-compile_parser = subparsers.add_parser('compile', help='compile a raw model')
+compile_parser = subparsers.add_parser("compile", help="compile a raw model")
 compile_parser.add_argument("model", help="raw model to compile")
+compile_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)

-classify_parser = subparsers.add_parser('classify', help='classify text')
+classify_parser = subparsers.add_parser("classify", help="classify text")
 classify_parser.add_argument("model", help="compiled model to use")
+classify_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
 group = classify_parser.add_mutually_exclusive_group()
 group.add_argument(
     "-j",

@@ -33,10 +37,10 @@ args = parser.parse_args()
 with open(args.model, "r") as f:
     model = json.load(f)

-if args.subparser_name == 'compile':
-    print(json.dumps(gptc.compile(model)))
+if args.subparser_name == "compile":
+    print(json.dumps(gptc.compile(model, args.max_ngram_length)))
 else:
-    classifier = gptc.Classifier(model)
+    classifier = gptc.Classifier(model, args.max_ngram_length)

     if sys.stdin.isatty():
         text = input("Text to analyse: ")
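Tying the new flags together, a hypothetical end-to-end run of the CLI, driven
from Python so the example is self-contained. This assumes the non-tty branch
(elided from the hunk above) reads the text to classify from stdin, and the
output file name is made up:

```python
import subprocess

# Compile the example raw model with trigrams; the compiled JSON is printed
# to stdout, so redirect it into a file.
with open("compiled-n3.json", "w") as f:
    subprocess.run(
        ["python", "-m", "gptc", "compile", "-n", "3", "models/raw.json"],
        stdout=f,
        check=True,
    )

# Classify a string with the same maximum ngram length; -c prints only the
# most likely category.
result = subprocess.run(
    ["python", "-m", "gptc", "classify", "-n", "3", "-c", "compiled-n3.json"],
    input="The report of my death was an exaggeration.",
    capture_output=True,
    text=True,
    check=True,
)
print(result.stdout.strip())
```
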
@@ -12,6 +12,11 @@ class Classifier:
     model : dict
         A compiled GPTC model.

+    max_ngram_length : int
+        The maximum ngram length to use when tokenizing input. If this is
+        greater than the value used when the model was compiled, it will be
+        silently lowered to that value.
+
     Attributes
     ----------
     model : dict

@@ -19,12 +24,15 @@ class Classifier:

     """

-    def __init__(self, model):
+    def __init__(self, model, max_ngram_length=1):
         if model.get("__version__", 0) != 3:
             raise gptc.exceptions.UnsupportedModelError(
                 f"unsupported model version"
             )
         self.model = model
+        self.max_ngram_length = min(
+            max_ngram_length, model.get("__ngrams__", 1)
+        )

     def confidence(self, text):
         """Classify text with confidence.

@@ -44,7 +52,7 @@ class Classifier:

         model = self.model

-        text = gptc.tokenizer.tokenize(text)
+        text = gptc.tokenizer.tokenize(text, self.max_ngram_length)
         probs = {}
         for word in text:
             try:

@@ -3,7 +3,7 @@
 import gptc.tokenizer


-def compile(raw_model):
+def compile(raw_model, max_ngram_length=1):
     """Compile a raw model.

     Parameters

@@ -11,6 +11,9 @@ def compile(raw_model):
     raw_model : list of dict
         A raw GPTC model.

+    max_ngram_length : int
+        Maximum ngram length to compile with.
+
     Returns
     -------
     dict

@@ -21,7 +24,7 @@ def compile(raw_model):
     categories = {}

     for portion in raw_model:
-        text = gptc.tokenizer.tokenize(portion["text"])
+        text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
         category = portion["category"]
         try:
             categories[category] += text

@@ -64,7 +67,7 @@ def compile(raw_model):
     )

     model["__names__"] = names
-
+    model["__ngrams__"] = max_ngram_length
     model["__version__"] = 3

     return model
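As the final hunk shows, compiled models now record the ngram setting next to
the other metadata keys. A small sketch of what that means for a freshly
compiled model (the training texts are made up):

```python
import gptc

raw_model = [
    {"text": "hello world", "category": "greeting"},
    {"text": "farewell cruel world", "category": "farewell"},
]

model = gptc.compile(raw_model, max_ngram_length=2)

# Metadata keys set by compile(), per the hunks above:
assert model["__version__"] == 3
assert model["__ngrams__"] == 2  # lets Classifier.__init__ clamp later
```
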
@@ -1,14 +1,23 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later


-def tokenize(text):
+def tokenize(text, max_ngram_length=1):
     """Convert a string to a list of lemmas."""
-    out = [""]
+    tokens = [""]

     for char in text.lower():
         if char.isalpha() or char == "'":
-            out[-1] += char
-        elif out[-1] != "":
-            out.append("")
+            tokens[-1] += char
+        elif tokens[-1] != "":
+            tokens.append("")

-    return [string for string in out if string]
+    tokens = [string for string in tokens if string]
+
+    if max_ngram_length == 1:
+        return tokens
+    else:
+        ngrams = []
+        for ngram_length in range(1, max_ngram_length + 1):
+            for index in range(len(tokens) + 1 - ngram_length):
+                ngrams.append(" ".join(tokens[index : index + ngram_length]))
+        return ngrams
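A worked example of the new tokenizer, with output derived by tracing the code
above:

```python
import gptc.tokenizer

# With the default max_ngram_length=1, behavior is unchanged:
print(gptc.tokenizer.tokenize("Hello, world!"))
# ['hello', 'world']

# With bigrams enabled, all 1-grams come first, then the 2-grams:
print(gptc.tokenizer.tokenize("to be or not to be", max_ngram_length=2))
# ['to', 'be', 'or', 'not', 'to', 'be',
#  'to be', 'be or', 'or not', 'not to', 'to be']
```
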
File diff suppressed because one or more lines are too long