Separate tokenization and hashing
parent 30287288f2
commit fc4665bb9e
@@ -30,7 +30,9 @@ def compile(
     categories: Dict[str, List[int]] = {}
 
     for portion in raw_model:
-        text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        text = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
+        )
         category = portion["category"]
         try:
             categories[category] += text
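The compiler now hashes each portion's n-grams as soon as they are tokenized, so categories accumulates integer token IDs rather than raw strings. A minimal sketch of the equivalent two-step call, assuming a portion dict shaped like the ones compile iterates over:

    import gptc.tokenizer

    portion = {"text": "An example document", "category": "good"}  # hypothetical input
    ngrams = gptc.tokenizer.tokenize(portion["text"], 2)  # List[str]: words plus bigrams
    ids = gptc.tokenizer.hash(ngrams)                     # List[int]: one 48-bit ID per n-gram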
@@ -39,8 +39,10 @@ class Model:
 
         model = self.weights
 
-        tokens = gptc.tokenizer.tokenize(
-            text, min(max_ngram_length, self.max_ngram_length)
+        tokens = gptc.tokenizer.hash(
+            gptc.tokenizer.tokenize(
+                text, min(max_ngram_length, self.max_ngram_length)
+            )
         )
         numbered_probs: Dict[int, float] = {}
         for word in tokens:
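The min() clamp matters because a model only stores weights for n-grams up to the length it was compiled with; tokenizing with a longer length at classification time would hash strings that can never match a stored ID. A toy illustration with assumed values:

    compiled_max = 2             # assumed: model was compiled with bigrams at most
    requested = 5                # caller asks for 5-grams
    effective = min(requested, compiled_max)
    print(effective)             # 2: tokenize() receives the model's own limit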
@@ -2,11 +2,10 @@
 
 from typing import List, Union
 import hashlib
-import base64
 import emoji
 
 
-def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
+def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
     text = text.lower()
     parts = []
     highest_end = 0
@@ -31,16 +30,19 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[int]:
     tokens = [string for string in tokens if string]
 
     if max_ngram_length == 1:
-        ngrams = tokens
+        return tokens
     else:
         ngrams = []
         for ngram_length in range(1, max_ngram_length + 1):
             for index in range(len(tokens) + 1 - ngram_length):
                 ngrams.append(" ".join(tokens[index : index + ngram_length]))
-
+        return ngrams
+
+
+def hash(tokens: List[str]) -> List[int]:
     return [
         int.from_bytes(
             hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
         )
-        for token in ngrams
+        for token in tokens
     ]
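After the split, tokenize returns human-readable n-gram strings and hash maps each string to an integer built from the first six bytes (48 bits) of its SHA-256 digest, read big-endian. A standalone sketch of the same computation using only the standard library; the sample strings are illustrative, not gptc output:

    import hashlib

    def token_id(token: str) -> int:
        # First 6 bytes of the SHA-256 digest, big-endian: a 48-bit integer ID.
        return int.from_bytes(hashlib.sha256(token.encode("utf-8")).digest()[:6], "big")

    for ngram in ["hello", "world", "hello world"]:
        print(ngram, token_id(ngram))  # element-wise, what hash() returns for these tokens

Keeping hashing out of tokenize leaves the string n-grams available to any caller that wants to inspect them, while compile and Model opt into the compact integer form at their call sites.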