You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
86 lines
2.3 KiB
86 lines
2.3 KiB
# SPDX-License-Identifier: GPL-3.0-or-later |
|
|
|
import unicodedata |
|
from typing import List, cast |
|
import hashlib |
|
import emoji |
|
|
|
|
|
def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
    """Split *text* into normalized word and emoji tokens, optionally as n-grams.

    The text is NFKD-normalized and case-folded first. Letters, numeric
    characters and apostrophes accumulate into words; a ``,`` or ``.`` is
    kept only when it directly follows a numeric character. Every emoji
    reported by ``emoji.emoji_list`` becomes a token of its own, and any
    other character ends the current word.

    Args:
        text: The raw text to tokenize.
        max_ngram_length: Longest n-gram to produce. With the default of 1
            the plain token list is returned.

    Returns:
        The tokens when ``max_ngram_length`` is 1. Otherwise every n-gram of
        length 1 up to ``max_ngram_length`` (shortest lengths first), each
        joined with single spaces; values below 1 yield an empty list.
    """
    folded = unicodedata.normalize("NFKD", text).casefold()

    # Break the text into single characters while keeping each emoji match
    # together as one unit.
    units: List[str] = []
    cursor = 0
    for match in emoji.emoji_list(folded):
        units.extend(folded[cursor : match["match_start"]])
        units.append(match["emoji"])
        cursor = match["match_end"]
    units.extend(folded[cursor:])

    words: List[str] = []
    current = ""  # the word being built; empty when between words
    for unit in units:
        if not unit:
            continue

        # "," and "." only count as part of a word right after a numeric
        # character (e.g. inside "1,5" or "3.14").
        extends_number = unit in ",." and current[-1:].isnumeric()
        if unit.isalpha() or unit.isnumeric() or unit == "'" or extends_number:
            current += unit
            continue

        if emoji.is_emoji(unit):
            # An emoji closes the pending word and stands alone as a token.
            if current:
                words.append(current)
            words.append(unit)
            current = ""
        elif current:
            # Any other character is a separator.
            words.append(current)
            current = ""

    if current:
        words.append(current)

    if max_ngram_length == 1:
        return words

    return [
        " ".join(words[start : start + size])
        for size in range(1, max_ngram_length + 1)
        for start in range(len(words) + 1 - size)
    ]
|
|
|
|
|
def _hash_single(token: str, hash_function: type) -> int:
    """Hash *token* and return the first six digest bytes as an integer.

    Args:
        token: Text to hash; it is encoded as UTF-8 before hashing.
        hash_function: A ``hashlib``-style constructor that takes the bytes
            to hash and returns a hash object.

    Returns:
        The leading 6 bytes (48 bits) of the digest, interpreted as a
        big-endian unsigned integer.
    """
    prefix_length = 6
    hasher = hash_function(token.encode("utf-8"))
    if hasher.digest_size == 0:
        # Variable-length hashes (SHAKE) report a digest size of 0 and need
        # an explicit output length; calling digest() without one raises
        # TypeError. Ask for exactly the bytes that are used.
        prefix = hasher.digest(prefix_length)
    else:
        prefix = hasher.digest()[:prefix_length]
    return int.from_bytes(prefix, "big")
|
|
|
|
|
def _get_hash_function(hash_algorithm: str) -> type:
    """Look up the ``hashlib`` constructor named *hash_algorithm*.

    Args:
        hash_algorithm: Name of the algorithm; it must be one of the names
            listed in ``supported`` below.

    Returns:
        The attribute of ``hashlib`` with that name.

    Raises:
        ValueError: If the name is not one of the supported algorithms.
    """
    supported = frozenset(
        (
            "blake2b",
            "blake2s",
            "md5",
            "sha1",
            "sha224",
            "sha256",
            "sha384",
            "sha3_224",
            "sha3_256",
            "sha3_384",
            "sha3_512",
            "sha512",
            "shake_128",
            "shake_256",
        )
    )

    # Reject unknown names up front so arbitrary hashlib attributes can
    # never be fetched through getattr.
    if hash_algorithm not in supported:
        raise ValueError("not a valid hash function: " + hash_algorithm)

    return cast(type, getattr(hashlib, hash_algorithm))
|
|
|
|
|
def hash_single(token: str, hash_algorithm: str) -> int:
    """Hash one token with the algorithm named *hash_algorithm*.

    Args:
        token: The text to hash.
        hash_algorithm: Algorithm name, resolved by ``_get_hash_function``.

    Returns:
        The integer produced by ``_hash_single`` for the token.

    Raises:
        ValueError: If ``_get_hash_function`` rejects the algorithm name.
    """
    hasher_factory = _get_hash_function(hash_algorithm)
    return _hash_single(token, hasher_factory)
|
|
|
|
|
def hash_list(tokens: List[str], hash_algorithm: str) -> List[int]:
    """Hash every token in *tokens* with the algorithm named *hash_algorithm*.

    Args:
        tokens: The texts to hash.
        hash_algorithm: Algorithm name, resolved by ``_get_hash_function``.

    Returns:
        One integer per token, in the same order as the input.

    Raises:
        ValueError: If ``_get_hash_function`` rejects the algorithm name;
            this is checked before any token is hashed.
    """
    # Resolve the constructor once rather than once per token.
    hasher_factory = _get_hash_function(hash_algorithm)
    return [_hash_single(entry, hasher_factory) for entry in tokens]
|
|
|
|
|
def normalize(text: str) -> str:
    """Return *text* reduced to its tokens, separated by single spaces.

    Args:
        text: The raw text to normalize.

    Returns:
        The tokens from ``tokenize(text, 1)`` joined with one space each.
    """
    unigrams = tokenize(text, 1)
    return " ".join(unigrams)
|
|
|