Refactor word count dict in compiler

This makes future changes to the algorithm much simpler.
This commit is contained in:
Samuel Sloniker 2022-11-23 11:33:40 -08:00
parent aea35ad059
commit af1d1749d2
Signed by: kj7rrv
GPG Key ID: 1BB4029E66285A62

View File

@@ -38,7 +38,7 @@ def compile(
except KeyError:
categories[category] = text
categories_by_count: Dict[str, Dict[str, float]] = {}
word_counts: Dict[str, Dict[str, float]] = {}
names = []
@@ -46,23 +46,27 @@ def compile(
if not category in names:
names.append(category)
categories_by_count[category] = {}
for word in text:
try:
categories_by_count[category][word] += 1 / len(
categories[category]
)
counts_for_word = word_counts[word]
except KeyError:
categories_by_count[category][word] = 1 / len(
categories[category]
)
word_weights: Dict[str, Dict[str, float]] = {}
for category, words in categories_by_count.items():
for word, value in words.items():
counts_for_word = {}
word_counts[word] = counts_for_word
try:
word_weights[word][category] = value
word_counts[word][category] += 1
except KeyError:
word_weights[word] = {category: value}
word_counts[word][category] = 1
word_weights: Dict[str, Dict[str, float]] = {}
for word, values in word_counts.items():
for category, value in values.items():
try:
word_weights[word][category] = value / len(categories[category])
except KeyError:
word_weights[word] = {
category: value / len(categories[category])
}
model: MODEL = {}
for word, weights in word_weights.items():
@@ -80,3 +84,4 @@ def compile(
model["__emoji__"] = int(gptc.tokenizer.has_emoji)
return model