Hash words and ngrams

This commit is contained in:
Samuel Sloniker 2022-11-23 12:53:01 -08:00
parent 1d1ccbb7cc
commit f4ae5f851d
Signed by: kj7rrv
GPG Key ID: 1BB4029E66285A62

View File

@@ -1,6 +1,8 @@
# SPDX-License-Identifier: GPL-3.0-or-later
from typing import List, Union
import hashlib
import base64
try:
import emoji
@@ -41,10 +43,16 @@ def tokenize(
tokens = [string for string in tokens if string]
if max_ngram_length == 1:
return tokens
ngrams = tokens
else:
ngrams = []
for ngram_length in range(1, max_ngram_length + 1):
for index in range(len(tokens) + 1 - ngram_length):
ngrams.append(" ".join(tokens[index : index + ngram_length]))
return ngrams
return [
base64.b64encode(
hashlib.sha256(token.encode("utf-8")).digest()[:6]
).decode("ascii")
for token in ngrams
]