Separate tokenization and hashing

Samuel Sloniker 2022-11-26 17:04:56 -08:00
parent 30287288f2
commit fc4665bb9e
Signed by: kj7rrv
GPG Key ID: 1BB4029E66285A62
3 changed files with 13 additions and 7 deletions
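
In short, this change splits what tokenize() used to do in one step into two composable functions: gptc.tokenizer.tokenize() now returns the n-gram strings themselves, and the new gptc.tokenizer.hash() converts them to integers. The call sites in the first two diffs below therefore change roughly like this (an illustrative sketch, not code from the commit; `text` and `max_ngram_length` stand in for whatever the caller already has):

    import gptc.tokenizer

    # Previously, tokenize() hashed internally and returned a List[int].
    # After this commit it returns List[str], and the caller hashes explicitly:
    hashed_tokens = gptc.tokenizer.hash(
        gptc.tokenizer.tokenize(text, max_ngram_length)
    )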


@@ -30,7 +30,9 @@ def compile(
     categories: Dict[str, List[int]] = {}
     for portion in raw_model:
-        text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        text = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        )
         category = portion["category"]
         try:
             categories[category] += text


@@ -39,8 +39,10 @@ class Model:
         model = self.weights
-        tokens = gptc.tokenizer.tokenize(
-            text, min(max_ngram_length, self.max_ngram_length)
+        tokens = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(
+                text, min(max_ngram_length, self.max_ngram_length)
+            )
         )
         numbered_probs: Dict[int, float] = {}
         for word in tokens:


@@ -2,11 +2,10 @@
 from typing import List, Union
 import hashlib
-import base64
 import emoji
 
 
-def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
+def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
     text = text.lower()
     parts = []
     highest_end = 0
@@ -31,16 +30,19 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
     tokens = [string for string in tokens if string]
 
     if max_ngram_length == 1:
-        ngrams = tokens
+        return tokens
     else:
         ngrams = []
         for ngram_length in range(1, max_ngram_length + 1):
             for index in range(len(tokens) + 1 - ngram_length):
                 ngrams.append(" ".join(tokens[index : index + ngram_length]))
+        return ngrams
 
+
+def hash(tokens: List[str]) -> List[int]:
     return [
         int.from_bytes(
             hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
         )
-        for token in ngrams
+        for token in tokens
     ]
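
Taken together, the tokenizer module now exposes a two-step API. A small usage sketch (illustrative only; the exact string tokens depend on tokenize()'s splitting rules, so the list shown in the comment is an assumption):

    import gptc.tokenizer

    # tokenize() now returns lowercased n-gram strings up to max_ngram_length words long.
    ngrams = gptc.tokenizer.tokenize("GPTC is a text classifier", max_ngram_length=2)
    # likely something like:
    # ["gptc", "is", "a", "text", "classifier", "gptc is", "is a", "a text", "text classifier"]

    # hash() maps each n-gram to an integer built from the first 6 bytes (48 bits)
    # of its SHA-256 digest, big-endian -- the same values tokenize() returned before.
    hashed = gptc.tokenizer.hash(ngrams)
    assert all(isinstance(n, int) and n < 2 ** 48 for n in hashed)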