From fc4665bb9e640d36a67553e3826a52632a4a6e98 Mon Sep 17 00:00:00 2001
From: Samuel Sloniker
Date: Sat, 26 Nov 2022 17:04:56 -0800
Subject: [PATCH] Separate tokenization and hashing

---
 gptc/compiler.py  |  4 +++-
 gptc/model.py     |  6 ++++--
 gptc/tokenizer.py | 10 ++++++----
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/gptc/compiler.py b/gptc/compiler.py
index 1415e4c..f994897 100755
--- a/gptc/compiler.py
+++ b/gptc/compiler.py
@@ -30,7 +30,9 @@ def compile(
     categories: Dict[str, List[int]] = {}
 
     for portion in raw_model:
-        text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        text = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        )
         category = portion["category"]
         try:
             categories[category] += text
diff --git a/gptc/model.py b/gptc/model.py
index e105674..e1772ab 100644
--- a/gptc/model.py
+++ b/gptc/model.py
@@ -39,8 +39,10 @@ class Model:
 
         model = self.weights
 
-        tokens = gptc.tokenizer.tokenize(
-            text, min(max_ngram_length, self.max_ngram_length)
+        tokens = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(
+                text, min(max_ngram_length, self.max_ngram_length)
+            )
         )
         numbered_probs: Dict[int, float] = {}
         for word in tokens:
diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py
index 67499db..bd5cd6d 100644
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@@ -2,11 +2,10 @@
 
 from typing import List, Union
 import hashlib
-import base64
 import emoji
 
 
-def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
+def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
     text = text.lower()
     parts = []
     highest_end = 0
@@ -31,16 +30,19 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
     tokens = [string for string in tokens if string]
 
     if max_ngram_length == 1:
-        ngrams = tokens
+        return tokens
    else:
         ngrams = []
         for ngram_length in range(1, max_ngram_length + 1):
             for index in range(len(tokens) + 1 - ngram_length):
                 ngrams.append(" ".join(tokens[index : index + ngram_length]))
+        return ngrams
 
+
+def hash(tokens: List[str]) -> List[int]:
     return [
         int.from_bytes(
             hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
         )
-        for token in ngrams
+        for token in tokens
     ]
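
After this patch, callers chain the two steps explicitly: tokenize() returns the
n-gram strings and hash() maps them to the integers the model stores. A minimal
usage sketch, assuming the gptc package is importable as in compiler.py and
model.py above (the sample text and n-gram length are illustrative, not taken
from the patch):

    import gptc.tokenizer

    # Step 1: build string tokens (here unigrams and bigrams) from raw text.
    tokens = gptc.tokenizer.tokenize("hello world hello gptc", max_ngram_length=2)

    # Step 2: hash each token to an integer taken from the first 6 bytes
    # of its SHA-256 digest, as done in gptc/tokenizer.py.
    hashed = gptc.tokenizer.hash(tokens)

    assert all(isinstance(t, str) for t in tokens)
    assert all(isinstance(h, int) for h in hashed)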