Hash words and ngrams
This commit is contained in:
parent
1d1ccbb7cc
commit
f4ae5f851d
|
@ -1,6 +1,8 @@
|
|||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
from typing import List, Union
|
||||
import hashlib
|
||||
import base64
|
||||
|
||||
try:
|
||||
import emoji
|
||||
|
@ -41,10 +43,16 @@ def tokenize(
|
|||
tokens = [string for string in tokens if string]
|
||||
|
||||
if max_ngram_length == 1:
|
||||
return tokens
|
||||
ngrams = tokens
|
||||
else:
|
||||
ngrams = []
|
||||
for ngram_length in range(1, max_ngram_length + 1):
|
||||
for index in range(len(tokens) + 1 - ngram_length):
|
||||
ngrams.append(" ".join(tokens[index : index + ngram_length]))
|
||||
return ngrams
|
||||
|
||||
return [
|
||||
base64.b64encode(
|
||||
hashlib.sha256(token.encode("utf-8")).digest()[:6]
|
||||
).decode("ascii")
|
||||
for token in ngrams
|
||||
]
|
||||
|
|
Loading…
Reference in New Issue
Block a user