Hash words and ngrams
This commit is contained in:
parent
1d1ccbb7cc
commit
f4ae5f851d
|
@@ -1,6 +1,8 @@
|
||||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
|
import hashlib
|
||||||
|
import base64
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import emoji
|
import emoji
|
||||||
|
@@ -41,10 +43,16 @@ def tokenize(
|
||||||
tokens = [string for string in tokens if string]
|
tokens = [string for string in tokens if string]
|
||||||
|
|
||||||
if max_ngram_length == 1:
|
if max_ngram_length == 1:
|
||||||
return tokens
|
ngrams = tokens
|
||||||
else:
|
else:
|
||||||
ngrams = []
|
ngrams = []
|
||||||
for ngram_length in range(1, max_ngram_length + 1):
|
for ngram_length in range(1, max_ngram_length + 1):
|
||||||
for index in range(len(tokens) + 1 - ngram_length):
|
for index in range(len(tokens) + 1 - ngram_length):
|
||||||
ngrams.append(" ".join(tokens[index : index + ngram_length]))
|
ngrams.append(" ".join(tokens[index : index + ngram_length]))
|
||||||
return ngrams
|
|
||||||
|
return [
|
||||||
|
base64.b64encode(
|
||||||
|
hashlib.sha256(token.encode("utf-8")).digest()[:6]
|
||||||
|
).decode("ascii")
|
||||||
|
for token in ngrams
|
||||||
|
]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user