Compare commits

1 commit: f38f4ca801 ... 8a1cb6105e

Author | SHA1 | Date
---|---|---
 | 8a1cb6105e |
@@ -112,6 +112,13 @@ GPTC.

 See `models/unpacked/` for an example of the format.

+### `gptc.Classifier(model, max_ngram_length=1)`
+
+`Classifier` objects are deprecated starting with GPTC 3.1.0, and will be
+removed in 4.0.0. See [the README from
+3.0.2](https://git.kj7rrv.com/kj7rrv/gptc/src/tag/v3.0.1/README.md) if you need
+documentation.
+
 ## Ngrams

 GPTC optionally supports using ngrams to improve classification accuracy. They
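The note above doesn't name a replacement, but the rest of this diff drives classification through the model object itself. A minimal migration sketch, assuming a compiled model saved at the hypothetical path `my_model.gptc`; every call used here (`gptc.deserialize`, `gptc.Classifier`, `Model.confidence`) appears elsewhere in this diff:

```python
import gptc

# "my_model.gptc" is a placeholder path for a compiled, serialized model.
with open("my_model.gptc", "rb") as f:
    model = gptc.deserialize(f)

text = "some text to classify"

# Deprecated in 3.1.0, to be removed in 4.0.0:
classifier = gptc.Classifier(model, max_ngram_length=1)
print(classifier.classify(text))

# Non-deprecated route, as used by the CLI further down in this diff:
print(model.confidence(text, 1))
```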
@@ -3,6 +3,7 @@
 """General-Purpose Text Classifier"""

 from gptc.compiler import compile as compile
+from gptc.classifier import Classifier as Classifier
 from gptc.pack import pack as pack
 from gptc.model import Model as Model, deserialize as deserialize
 from gptc.tokenizer import normalize as normalize
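The redundant `import X as X` aliases above are the explicit re-export idiom that strict type checkers (for example mypy with `--no-implicit-reexport`) treat as intentional public API; with the added line, `Classifier` becomes reachable from the package root. A quick interactive sketch:

```python
# Sketch: after this change, Classifier is importable from the top-level
# package, alongside the names that were already re-exported.
import gptc

print(gptc.Classifier.__module__)                         # "gptc.classifier"
print(hasattr(gptc, "Model"), hasattr(gptc, "compile"))   # True True
```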
@@ -44,6 +44,19 @@ def main() -> None:
         type=int,
         default=1,
     )
+    group = classify_parser.add_mutually_exclusive_group()
+    group.add_argument(
+        "-j",
+        "--json",
+        help="output confidence dict as JSON (default)",
+        action="store_true",
+    )
+    group.add_argument(
+        "-c",
+        "--category",
+        help="output most likely category or `None`",
+        action="store_true",
+    )

     check_parser = subparsers.add_parser(
         "check", help="check one word or ngram in model"
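Because the two flags are registered on a mutually exclusive group, `-j` and `-c` are accepted individually but rejected together. A standalone sketch of the same argparse pattern (not gptc's actual parser) showing the behavior:

```python
# Standalone sketch of the argparse pattern added above.
import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument("-j", "--json", action="store_true")
group.add_argument("-c", "--category", action="store_true")

print(parser.parse_args(["-c"]))   # Namespace(json=False, category=True)
# parser.parse_args(["-j", "-c"]) would exit with
# "error: argument -c/--category: not allowed with argument -j/--json"
```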
@@ -75,7 +88,12 @@ def main() -> None:
         else:
             text = sys.stdin.read()

-        print(json.dumps(model.confidence(text, args.max_ngram_length)))
+        if args.category:
+            classifier = gptc.Classifier(model, args.max_ngram_length)
+            print(classifier.classify(text))
+        else:
+            probabilities = model.confidence(text, args.max_ngram_length)
+            print(json.dumps(probabilities))
    elif args.subparser_name == "check":
        with open(args.model, "rb") as f:
            model = gptc.deserialize(f)
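With these branches wired in, the `classify` subcommand prints either the single most likely category (`-c`) or, by default, the full confidence mapping as JSON. A small illustration of the two output forms, with invented categories and probabilities:

```python
import json

# Hypothetical confidence dict of the shape model.confidence() returns;
# the categories and probabilities are made up for illustration.
probabilities = {"tech": 0.87, "sports": 0.13}

print(json.dumps(probabilities))                   # default / -j: {"tech": 0.87, "sports": 0.13}
print(max(probabilities, key=probabilities.get))   # roughly what -c prints: tech
```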
gptc/classifier.py (new executable file, 68 lines)

# SPDX-License-Identifier: GPL-3.0-or-later

import gptc.model
from typing import Dict, Union


class Classifier:
    """A text classifier.

    Parameters
    ----------
    model : dict
        A compiled GPTC model.

    max_ngram_length : int
        The maximum ngram length to use when tokenizing input. If this is
        greater than the value used when the model was compiled, it will be
        silently lowered to that value.

    Attributes
    ----------
    model : dict
        The model used.

    """

    def __init__(self, model: gptc.model.Model, max_ngram_length: int = 1):
        self.model = model
        model_ngrams = model.max_ngram_length
        self.max_ngram_length = min(max_ngram_length, model_ngrams)

    def confidence(self, text: str) -> Dict[str, float]:
        """Classify text with confidence.

        Parameters
        ----------
        text : str
            The text to classify

        Returns
        -------
        dict
            {category:probability, category:probability...} or {} if no words
            matching any categories in the model were found

        """
        return self.model.confidence(text, self.max_ngram_length)

    def classify(self, text: str) -> Union[str, None]:
        """Classify text.

        Parameters
        ----------
        text : str
            The text to classify

        Returns
        -------
        str or None
            The most likely category, or None if no words matching any
            category in the model were found.

        """
        probs: Dict[str, float] = self.confidence(text)
        try:
            return sorted(probs.items(), key=lambda x: x[1])[-1][0]
        except IndexError:
            return None
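`classify()` picks the winning category by sorting the `(category, probability)` pairs and taking the last one; for a non-empty dict that is an ordinary argmax (tie-breaking aside), and an empty dict falls through to `None` via the `IndexError`. A small illustration with a made-up confidence dict:

```python
# Illustration of the selection step in Classifier.classify(); the
# confidence values below are invented, not real model output.
probs = {"spam": 0.21, "ham": 0.64, "unknown": 0.15}

by_sort = sorted(probs.items(), key=lambda x: x[1])[-1][0]  # what classify() does
by_max = max(probs, key=lambda k: probs[k])                 # equivalent argmax for non-empty dicts

assert by_sort == by_max == "ham"
# With an empty dict, sorted(...)[-1] raises IndexError, which classify()
# catches and turns into None.
```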
gptc/compiler.py (109 lines changed)

@@ -2,65 +2,7 @@

 import gptc.tokenizer
 import gptc.model
-from typing import Iterable, Mapping, List, Dict, Union, Tuple
-
-
-def _count_words(
-    raw_model: Iterable[Mapping[str, str]],
-    max_ngram_length: int,
-    hash_algorithm: str,
-) -> Tuple[Dict[int, Dict[str, int]], Dict[str, int], List[str]]:
-    word_counts: Dict[int, Dict[str, int]] = {}
-    category_lengths: Dict[str, int] = {}
-    names: List[str] = []
-
-    for portion in raw_model:
-        text = gptc.tokenizer.hash(
-            gptc.tokenizer.tokenize(portion["text"], max_ngram_length),
-            hash_algorithm,
-        )
-        category = portion["category"]
-
-        if not category in names:
-            names.append(category)
-
-        category_lengths[category] = category_lengths.get(category, 0) + len(
-            text
-        )
-
-        for word in text:
-            if word in word_counts:
-                try:
-                    word_counts[word][category] += 1
-                except KeyError:
-                    word_counts[word][category] = 1
-            else:
-                word_counts[word] = {category: 1}
-
-    return word_counts, category_lengths, names
-
-
-def _get_weights(
-    min_count: int,
-    word_counts: Dict[int, Dict[str, int]],
-    category_lengths: Dict[str, int],
-    names: List[str],
-) -> Dict[int, List[int]]:
-    model: Dict[int, List[int]] = {}
-    for word, counts in word_counts.items():
-        if sum(counts.values()) >= min_count:
-            weights = {
-                category: value / category_lengths[category]
-                for category, value in counts.items()
-            }
-            total = sum(weights.values())
-            new_weights: List[int] = []
-            for category in names:
-                new_weights.append(
-                    round((weights.get(category, 0) / total) * 65535)
-                )
-            model[word] = new_weights
-    return model
+from typing import Iterable, Mapping, List, Dict, Union


 def compile(
@@ -85,8 +27,49 @@ def compile(
        A compiled GPTC model.

     """
-    word_counts, category_lengths, names = _count_words(
-        raw_model, max_ngram_length, hash_algorithm
-    )
-    model = _get_weights(min_count, word_counts, category_lengths, names)
+    word_counts: Dict[int, Dict[int, int]] = {}
+    category_lengths: Dict[int, int] = {}
+    names: List[str] = []
+
+    for portion in raw_model:
+        text = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(portion["text"], max_ngram_length),
+            hash_algorithm,
+        )
+        category_name = portion["category"]
+
+        if not category_name in names:
+            names.append(category_name)
+
+        category = names.index(category_name)
+
+        category_lengths[category] = category_lengths.get(category, 0) + len(
+            text
+        )
+
+        for word in text:
+            if word in word_counts:
+                try:
+                    word_counts[word][category] += 1
+                except KeyError:
+                    word_counts[word][category] = 1
+            else:
+                word_counts[word] = {category: 1}
+
+    model: Dict[int, List[int]] = {}
+    for word, counts in word_counts.items():
+        if sum(counts.values()) >= min_count:
+            weights = {
+                category: value / category_lengths[category]
+                for category, value in counts.items()
+            }
+            total = sum(weights.values())
+            new_weights: List[int] = []
+            for category in range(len(names)):
+                new_weights.append(
+                    round((weights.get(category, 0) / total) * 65535)
+                )
+            model[word] = new_weights
+
     return gptc.model.Model(model, names, max_ngram_length, hash_algorithm)
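The weighting inlined above first normalizes each raw count by the total token count of its category, then rescales each word's weights so they sum to about 65535, presumably so every weight fits in an unsigned 16-bit integer. A worked sketch with made-up counts and hypothetical category names:

```python
# Worked example of the weighting step in compile(), with invented numbers:
# suppose one hashed token occurs 3 times in category 0 and once in
# category 1, and the categories contain 200 and 50 tokens respectively.
counts = {0: 3, 1: 1}
category_lengths = {0: 200, 1: 50}
names = ["tech", "sports"]  # hypothetical category names

# Per-category frequency, so large categories don't dominate.
weights = {c: v / category_lengths[c] for c, v in counts.items()}  # {0: 0.015, 1: 0.02}
total = sum(weights.values())

new_weights = [round((weights.get(c, 0) / total) * 65535) for c in range(len(names))]
print(new_weights)  # [28086, 37449] -- sums to 65535, each value fits in 16 bits
```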
profiler.py (deleted, 16 lines)

# SPDX-License-Identifier: GPL-3.0-or-later

import cProfile
import gptc
import json
import sys

max_ngram_length = 10

with open("models/raw.json") as f:
    raw_model = json.load(f)

with open("models/benchmark_text.txt") as f:
    text = f.read()

cProfile.run("gptc.compile(raw_model, max_ngram_length)")