Separate tokenization and hashing

Samuel Sloniker 2022-11-26 17:04:56 -08:00
parent 30287288f2
commit fc4665bb9e
Signed by: kj7rrv
GPG Key ID: 1BB4029E66285A62
3 changed files with 13 additions and 7 deletions

View File

@@ -30,7 +30,9 @@ def compile(
     categories: Dict[str, List[int]] = {}
     for portion in raw_model:
-        text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        text = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        )
         category = portion["category"]
         try:
             categories[category] += text
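
For orientation, a minimal sketch of what this call-site change amounts to: the integer IDs produced are the same as before, but tokenization and hashing are now two explicit steps. The sample portion dict below is illustrative and not taken from the commit; gptc.tokenizer and max_ngram_length are as in the diff.

import gptc.tokenizer

# Illustrative sample only; in compile() each portion comes from raw_model.
portion = {"text": "sample training text", "category": "example"}
max_ngram_length = 1

# Before this commit: tokenize() returned hashed integers directly.
# After: tokenize() returns n-gram strings and hash() maps them to integers.
text = gptc.tokenizer.hash(
    gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
)
# text is a list of integer token IDs, the same values the one-step call produced.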

View File

@@ -39,8 +39,10 @@ class Model:
         model = self.weights
-        tokens = gptc.tokenizer.tokenize(
-            text, min(max_ngram_length, self.max_ngram_length)
+        tokens = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(
+                text, min(max_ngram_length, self.max_ngram_length)
+            )
         )
         numbered_probs: Dict[int, float] = {}
         for word in tokens:

View File

@@ -2,11 +2,10 @@
 from typing import List, Union
 import hashlib
-import base64
 import emoji
 
 
-def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
+def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
     text = text.lower()
     parts = []
     highest_end = 0
@@ -31,16 +30,19 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
     tokens = [string for string in tokens if string]
 
     if max_ngram_length == 1:
-        ngrams = tokens
+        return tokens
     else:
         ngrams = []
         for ngram_length in range(1, max_ngram_length + 1):
             for index in range(len(tokens) + 1 - ngram_length):
                 ngrams.append(" ".join(tokens[index : index + ngram_length]))
+        return ngrams
 
+
+def hash(tokens: List[str]) -> List[int]:
     return [
         int.from_bytes(
             hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
         )
-        for token in ngrams
+        for token in tokens
     ]
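
With tokenization and hashing separated, callers can inspect the readable n-gram strings before they are reduced to integer IDs. A minimal usage sketch, assuming the gptc.tokenizer module exactly as shown above (the input text and variable names are illustrative):

import gptc.tokenizer

sample = "The quick brown fox"  # illustrative input

# Step 1: tokenize() now returns human-readable n-gram strings.
ngrams = gptc.tokenizer.tokenize(sample, max_ngram_length=2)
# e.g. ["the", "quick", "brown", "fox", "the quick", "quick brown", "brown fox"]

# Step 2: hash() maps each n-gram to a 48-bit integer, the big-endian value
# of the first six bytes of its SHA-256 digest.
ids = gptc.tokenizer.hash(ngrams)

print(ngrams)
print(ids)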