From fc4665bb9e640d36a67553e3826a52632a4a6e98 Mon Sep 17 00:00:00 2001
From: Samuel Sloniker
Date: Sat, 26 Nov 2022 17:04:56 -0800
Subject: [PATCH] Separate tokenization and hashing

---
 gptc/compiler.py  |  4 +++-
 gptc/model.py     |  6 ++++--
 gptc/tokenizer.py | 10 ++++++----
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/gptc/compiler.py b/gptc/compiler.py
index 1415e4c..f994897 100755
--- a/gptc/compiler.py
+++ b/gptc/compiler.py
@@ -30,7 +30,9 @@ def compile(
     categories: Dict[str, List[int]] = {}
 
     for portion in raw_model:
-        text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        text = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        )
         category = portion["category"]
         try:
             categories[category] += text
diff --git a/gptc/model.py b/gptc/model.py
index e105674..e1772ab 100644
--- a/gptc/model.py
+++ b/gptc/model.py
@@ -39,8 +39,10 @@ class Model:
 
         model = self.weights
 
-        tokens = gptc.tokenizer.tokenize(
-            text, min(max_ngram_length, self.max_ngram_length)
+        tokens = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(
+                text, min(max_ngram_length, self.max_ngram_length)
+            )
         )
         numbered_probs: Dict[int, float] = {}
         for word in tokens:
diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py
index 67499db..bd5cd6d 100644
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@@ -2,11 +2,10 @@
 
 from typing import List, Union
 import hashlib
-import base64
 import emoji
 
 
-def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
+def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
     text = text.lower()
     parts = []
     highest_end = 0
@@ -31,16 +30,19 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
     tokens = [string for string in tokens if string]
 
     if max_ngram_length == 1:
-        ngrams = tokens
+        return tokens
    else:
         ngrams = []
         for ngram_length in range(1, max_ngram_length + 1):
             for index in range(len(tokens) + 1 - ngram_length):
                 ngrams.append(" ".join(tokens[index : index + ngram_length]))
+        return ngrams
 
+
+def hash(tokens: List[str]) -> List[int]:
     return [
         int.from_bytes(
             hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
         )
-        for token in ngrams
+        for token in tokens
     ]
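
After this patch, callers chain the two steps explicitly: tokenize() returns the
n-gram strings and hash() maps them to the integers the model stores. A minimal
usage sketch, assuming the gptc package is importable as in compiler.py and
model.py above (the sample text and n-gram length are illustrative, not taken
from the patch):

    import gptc.tokenizer

    # Step 1: build string tokens (here unigrams and bigrams) from raw text.
    tokens = gptc.tokenizer.tokenize("hello world hello gptc", max_ngram_length=2)

    # Step 2: hash each token to an integer taken from the first 6 bytes
    # of its SHA-256 digest, as done in gptc/tokenizer.py.
    hashed = gptc.tokenizer.hash(tokens)

    assert all(isinstance(t, str) for t in tokens)
    assert all(isinstance(h, int) for h in hashed)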