Separate tokenization and hashing

Samuel Sloniker 2022-11-26 17:04:56 -08:00
parent 30287288f2
commit fc4665bb9e
Signed by: kj7rrv
GPG Key ID: 1BB4029E66285A62
3 changed files with 13 additions and 7 deletions

View File

@@ -30,7 +30,9 @@ def compile(
     categories: Dict[str, List[int]] = {}
     for portion in raw_model:
-        text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        text = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        )
         category = portion["category"]
         try:
             categories[category] += text
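
For orientation, a minimal sketch of what this call-site change amounts to: the integer IDs produced are the same as before, but tokenization and hashing are now two explicit steps. The sample portion dict below is illustrative and not taken from the commit; gptc.tokenizer and max_ngram_length are as in the diff.

import gptc.tokenizer

# Illustrative sample only; in compile() each portion comes from raw_model.
portion = {"text": "sample training text", "category": "example"}
max_ngram_length = 1

# Before this commit: tokenize() returned hashed integers directly.
# After: tokenize() returns n-gram strings and hash() maps them to integers.
text = gptc.tokenizer.hash(
    gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
)
# text is a list of integer token IDs, the same values the one-step call produced.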

View File

@@ -39,8 +39,10 @@ class Model:
         model = self.weights
-        tokens = gptc.tokenizer.tokenize(
-            text, min(max_ngram_length, self.max_ngram_length)
+        tokens = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(
+                text, min(max_ngram_length, self.max_ngram_length)
+            )
         )
         numbered_probs: Dict[int, float] = {}
         for word in tokens:

View File

@@ -2,11 +2,10 @@
 from typing import List, Union
 import hashlib
-import base64
 import emoji
 
 
-def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
+def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
     text = text.lower()
     parts = []
     highest_end = 0
@@ -31,16 +30,19 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
     tokens = [string for string in tokens if string]
 
     if max_ngram_length == 1:
-        ngrams = tokens
+        return tokens
     else:
         ngrams = []
         for ngram_length in range(1, max_ngram_length + 1):
             for index in range(len(tokens) + 1 - ngram_length):
                 ngrams.append(" ".join(tokens[index : index + ngram_length]))
+        return ngrams
 
+
+def hash(tokens: List[str]) -> List[int]:
     return [
         int.from_bytes(
             hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
         )
-        for token in ngrams
+        for token in tokens
     ]
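
With tokenization and hashing separated, callers can inspect the readable n-gram strings before they are reduced to integer IDs. A minimal usage sketch, assuming the gptc.tokenizer module exactly as shown above (the input text and variable names are illustrative):

import gptc.tokenizer

sample = "The quick brown fox"  # illustrative input

# Step 1: tokenize() now returns human-readable n-gram strings.
ngrams = gptc.tokenizer.tokenize(sample, max_ngram_length=2)
# e.g. ["the", "quick", "brown", "fox", "the quick", "quick brown", "brown fox"]

# Step 2: hash() maps each n-gram to a 48-bit integer, the big-endian value
# of the first six bytes of its SHA-256 digest.
ids = gptc.tokenizer.hash(ngrams)

print(ngrams)
print(ids)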