Split hash function

This commit is contained in:
Samuel Sloniker 2022-11-26 17:42:42 -08:00
parent 08437a2696
commit b3a43150d8
Signed by: kj7rrv
GPG Key ID: 1BB4029E66285A62

View File

@ -39,13 +39,14 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
return ngrams
def hash_single(token: str) -> int:
    """Hash a single token to a non-negative integer.

    The token is encoded as UTF-8 and digested with SHA-256. Only the
    first six bytes of the digest are kept and read as a big-endian
    unsigned integer, so the result always lies in ``range(2 ** 48)``.
    """
    digest = hashlib.sha256(token.encode("utf-8")).digest()
    # Truncate to 6 bytes (48 bits) before converting to an int.
    return int.from_bytes(digest[:6], "big")
def hash(tokens: List[str]) -> List[int]:
    """Hash every token in ``tokens``.

    Each token is passed through ``hash_single``; the results are
    returned in the same order as the input, one integer per token.
    An empty input list yields an empty list.

    Note: this function intentionally keeps its existing name, which
    shadows the builtin ``hash`` inside this module.
    """
    # Delegate to hash_single so the digest logic lives in one place
    # instead of being duplicated inline here.
    return [hash_single(token) for token in tokens]
def normalize(text: str) -> str: