From af1d1749d2bd086e0012420b5ce4d6da6259fad4 Mon Sep 17 00:00:00 2001 From: Samuel Sloniker Date: Wed, 23 Nov 2022 11:33:40 -0800 Subject: [PATCH] Refactor word count dict in compiler This makes future changes to the algorithm much simpler. --- gptc/compiler.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/gptc/compiler.py b/gptc/compiler.py index 901b466..667f4e7 100755 --- a/gptc/compiler.py +++ b/gptc/compiler.py @@ -38,7 +38,7 @@ def compile( except KeyError: categories[category] = text - categories_by_count: Dict[str, Dict[str, float]] = {} + word_counts: Dict[str, Dict[str, float]] = {} names = [] @@ -46,23 +46,27 @@ def compile( if not category in names: names.append(category) - categories_by_count[category] = {} for word in text: try: - categories_by_count[category][word] += 1 / len( - categories[category] - ) + counts_for_word = word_counts[word] except KeyError: - categories_by_count[category][word] = 1 / len( - categories[category] - ) - word_weights: Dict[str, Dict[str, float]] = {} - for category, words in categories_by_count.items(): - for word, value in words.items(): + counts_for_word = {} + word_counts[word] = counts_for_word + try: - word_weights[word][category] = value + word_counts[word][category] += 1 except KeyError: - word_weights[word] = {category: value} + word_counts[word][category] = 1 + + word_weights: Dict[str, Dict[str, float]] = {} + for word, values in word_counts.items(): + for category, value in values.items(): + try: + word_weights[word][category] = value / len(categories[category]) + except KeyError: + word_weights[word] = { + category: value / len(categories[category]) + } model: MODEL = {} for word, weights in word_weights.items(): @@ -80,3 +84,4 @@ def compile( model["__emoji__"] = int(gptc.tokenizer.has_emoji) return model +