Split hash function
This commit is contained in:
parent
08437a2696
commit
b3a43150d8
|
@ -39,13 +39,14 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
|
||||||
return ngrams
|
return ngrams
|
||||||
|
|
||||||
|
|
||||||
|
def hash_single(token: str) -> int:
|
||||||
|
return int.from_bytes(
|
||||||
|
hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def hash(tokens: List[str]) -> List[int]:
|
def hash(tokens: List[str]) -> List[int]:
|
||||||
return [
|
return [hash_single(token) for token in tokens]
|
||||||
int.from_bytes(
|
|
||||||
hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
|
|
||||||
)
|
|
||||||
for token in tokens
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def normalize(text: str) -> str:
|
def normalize(text: str) -> str:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user