Refactor word count dict in compiler
This makes future changes to the algorithm much simpler.
This commit is contained in:
parent
aea35ad059
commit
af1d1749d2
|
@ -38,7 +38,7 @@ def compile(
|
|||
except KeyError:
|
||||
categories[category] = text
|
||||
|
||||
categories_by_count: Dict[str, Dict[str, float]] = {}
|
||||
word_counts: Dict[str, Dict[str, float]] = {}
|
||||
|
||||
names = []
|
||||
|
||||
|
@ -46,23 +46,27 @@ def compile(
|
|||
if not category in names:
|
||||
names.append(category)
|
||||
|
||||
categories_by_count[category] = {}
|
||||
for word in text:
|
||||
try:
|
||||
categories_by_count[category][word] += 1 / len(
|
||||
categories[category]
|
||||
)
|
||||
counts_for_word = word_counts[word]
|
||||
except KeyError:
|
||||
categories_by_count[category][word] = 1 / len(
|
||||
categories[category]
|
||||
)
|
||||
word_weights: Dict[str, Dict[str, float]] = {}
|
||||
for category, words in categories_by_count.items():
|
||||
for word, value in words.items():
|
||||
counts_for_word = {}
|
||||
word_counts[word] = counts_for_word
|
||||
|
||||
try:
|
||||
word_weights[word][category] = value
|
||||
word_counts[word][category] += 1
|
||||
except KeyError:
|
||||
word_weights[word] = {category: value}
|
||||
word_counts[word][category] = 1
|
||||
|
||||
word_weights: Dict[str, Dict[str, float]] = {}
|
||||
for word, values in word_counts.items():
|
||||
for category, value in values.items():
|
||||
try:
|
||||
word_weights[word][category] = value / len(categories[category])
|
||||
except KeyError:
|
||||
word_weights[word] = {
|
||||
category: value / len(categories[category])
|
||||
}
|
||||
|
||||
model: MODEL = {}
|
||||
for word, weights in word_weights.items():
|
||||
|
@ -80,3 +84,4 @@ def compile(
|
|||
model["__emoji__"] = int(gptc.tokenizer.has_emoji)
|
||||
|
||||
return model
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user