Separate tokenization and hashing
parent 30287288f2
commit fc4665bb9e
@@ -30,7 +30,9 @@ def compile(
     categories: Dict[str, List[int]] = {}
 
     for portion in raw_model:
-        text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        text = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        )
         category = portion["category"]
         try:
             categories[category] += text
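Note on the compile() hunk above: tokenization and hashing are now two explicit steps, with gptc.tokenizer.hash() applied to the string n-grams returned by gptc.tokenizer.tokenize(). A minimal sketch of that call pattern, assuming the gptc package is importable; the sample portion and n-gram length below are made up for illustration and are not part of the commit:

import gptc.tokenizer

# Made-up training portion in the shape compile() iterates over.
portion = {"text": "good service, great food", "category": "positive"}
max_ngram_length = 2

# Step 1: n-gram strings; step 2: one 48-bit integer ID per n-gram.
ngrams = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
token_ids = gptc.tokenizer.hash(ngrams)
print(token_ids)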
@@ -39,8 +39,10 @@ class Model:
 
         model = self.weights
 
-        tokens = gptc.tokenizer.tokenize(
-            text, min(max_ngram_length, self.max_ngram_length)
+        tokens = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(
+                text, min(max_ngram_length, self.max_ngram_length)
+            )
         )
         numbered_probs: Dict[int, float] = {}
         for word in tokens:
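Note on the Model hunk above: classification applies the same tokenize-then-hash split, and keeps capping the requested n-gram length at the model's own limit via min(). A trivial, self-contained illustration of that cap; both values below are hypothetical:

# Hypothetical values, for illustration only.
requested_max_ngram_length = 5  # what the caller passes in
model_max_ngram_length = 2      # what the model was compiled with

# The effective n-gram length never exceeds the model's own limit.
assert min(requested_max_ngram_length, model_max_ngram_length) == 2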
@@ -2,11 +2,10 @@
 
 from typing import List, Union
 import hashlib
-import base64
 import emoji
 
 
-def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
+def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
     text = text.lower()
     parts = []
     highest_end = 0
@@ -31,16 +30,19 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
     tokens = [string for string in tokens if string]
 
     if max_ngram_length == 1:
-        ngrams = tokens
+        return tokens
     else:
         ngrams = []
         for ngram_length in range(1, max_ngram_length + 1):
             for index in range(len(tokens) + 1 - ngram_length):
                 ngrams.append(" ".join(tokens[index : index + ngram_length]))
-
+        return ngrams
+
+
+def hash(tokens: List[str]) -> List[int]:
     return [
         int.from_bytes(
             hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
         )
-        for token in ngrams
+        for token in tokens
     ]
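For reference, the new hash() helper relies only on the standard library, so its output can be reproduced without gptc; this standalone sketch mirrors the comprehension in the diff (the name hash_tokens is invented here to avoid shadowing the builtin):

import hashlib
from typing import List


def hash_tokens(tokens: List[str]) -> List[int]:
    # Big-endian integer from the first 6 bytes (48 bits) of the
    # SHA-256 digest of each token string, as in gptc.tokenizer.hash().
    return [
        int.from_bytes(hashlib.sha256(token.encode("utf-8")).digest()[:6], "big")
        for token in tokens
    ]


print(hash_tokens(["hello", "hello world"]))  # two 48-bit integer IDs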