gptc/gptc/compiler.py
Samuel Sloniker af1d1749d2
Refactor word count dict in compiler
This makes future changes to the algorithm much simpler.
2022-11-23 11:33:40 -08:00

88 lines
2.3 KiB
Python
Executable File

# SPDX-License-Identifier: GPL-3.0-or-later
import gptc.tokenizer
from typing import Iterable, Mapping, List, Dict, Union
# Per-word weight list: one integer per category, positionally aligned with
# the category order stored in model["__names__"].
WEIGHTS_T = List[int]
# Values stored under the model's special config keys
# (__names__, __ngrams__, __version__, __emoji__).
CONFIG_T = Union[List[str], int, str]
# A compiled model: maps each word to its weights, plus the config entries.
MODEL = Dict[str, Union[WEIGHTS_T, CONFIG_T]]
def compile(
    raw_model: Iterable[Mapping[str, str]], max_ngram_length: int = 1
) -> MODEL:
    """Compile a raw model.

    Parameters
    ----------
    raw_model : list of dict
        A raw GPTC model.
    max_ngram_length : int
        Maximum ngram length to compile with.

    Returns
    -------
    dict
        A compiled GPTC model.
    """
    # Concatenate the tokenized text of all portions, grouped by category.
    categories: Dict[str, List[str]] = {}
    for portion in raw_model:
        tokens = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
        categories.setdefault(portion["category"], []).extend(tokens)

    # Category order is fixed here and recorded in __names__ so that each
    # word's weight list can be positional. Categories are dict keys, hence
    # already unique; dict preserves insertion order (Python 3.7+).
    names = list(categories)

    # Count occurrences of each word within each category.
    word_counts: Dict[str, Dict[str, int]] = {}
    for category, text in categories.items():
        for word in text:
            counts_for_word = word_counts.setdefault(word, {})
            counts_for_word[category] = counts_for_word.get(category, 0) + 1

    # Normalize counts by category size so larger categories don't dominate.
    word_weights: Dict[str, Dict[str, float]] = {
        word: {
            category: count / len(categories[category])
            for category, count in counts.items()
        }
        for word, counts in word_counts.items()
    }

    # Quantize each word's weights to 16-bit integers (scaled so they sum to
    # approximately 65535), ordered to match __names__.
    model: MODEL = {}
    for word, weights in word_weights.items():
        total = sum(weights.values())
        model[word] = [
            round((weights.get(category, 0) / total) * 65535)
            for category in names
        ]

    model["__names__"] = names
    model["__ngrams__"] = max_ngram_length
    model["__version__"] = 3
    model["__emoji__"] = int(gptc.tokenizer.has_emoji)
    return model