Refactor word count dict in compiler

This makes future changes to the algorithm much simpler.
2022-11-23 11:33:40 -08:00 · 2022-11-23 11:33:40 -08:00 · af1d1749d2
commit af1d1749d2
parent aea35ad059
1 changed file with 18 additions and 13 deletions
--- a/gptc/compiler.py
+++ b/gptc/compiler.py
@@ -38,7 +38,7 @@ def compile(
        except KeyError:
            categories[category] = text

-    categories_by_count: Dict[str, Dict[str, float]] = {}
+    word_counts: Dict[str, Dict[str, float]] = {}

    names = []

@@ -46,23 +46,27 @@ def compile(
        if not category in names:
            names.append(category)

-        categories_by_count[category] = {}
        for word in text:
            try:
-                categories_by_count[category][word] += 1 / len(
-                    categories[category]
-                )
+                counts_for_word = word_counts[word]
            except KeyError:
-                categories_by_count[category][word] = 1 / len(
-                    categories[category]
-                )
-    word_weights: Dict[str, Dict[str, float]] = {}
-    for category, words in categories_by_count.items():
-        for word, value in words.items():
+                counts_for_word = {}
+                word_counts[word] = counts_for_word
+
            try:
-                word_weights[word][category] = value
+                word_counts[word][category] += 1
            except KeyError:
-                word_weights[word] = {category: value}
+                word_counts[word][category] = 1
+
+    word_weights: Dict[str, Dict[str, float]] = {}
+    for word, values in word_counts.items():
+        for category, value in values.items():
+            try:
+                word_weights[word][category] = value / len(categories[category])
+            except KeyError:
+                word_weights[word] = {
+                    category: value / len(categories[category])
+                }

    model: MODEL = {}
    for word, weights in word_weights.items():
@@ -80,3 +84,4 @@ def compile(
    model["__emoji__"] = int(gptc.tokenizer.has_emoji)

    return model
+