Samuel Sloniker
11 months ago
2 changed files with 73 additions and 0 deletions
@ -0,0 +1,14 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later |
||||
|
||||
from typing import Iterable, Iterator |
||||
import hashlib |
||||
|
||||
|
||||
def hash_single(token: str) -> int:
    """Hash *token* to a 48-bit integer.

    The token is UTF-8 encoded, hashed with SHA-256, and the first six
    bytes of the digest are read as a big-endian unsigned integer, so
    the result always lies in ``[0, 2**48)``.
    """
    digest = hashlib.sha256(token.encode("utf-8")).digest()
    return int.from_bytes(digest[:6], byteorder="big")
||||
|
||||
|
||||
def hash_list(tokens: Iterable[str]) -> Iterator[int]:
    """Lazily map :func:`hash_single` over *tokens*.

    Returns an iterator; tokens are hashed on demand, not up front.
    """
    return map(hash_single, tokens)
@ -0,0 +1,59 @@
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later |
||||
|
||||
import unicodedata |
||||
from typing import List, Iterator, Iterable |
||||
import emoji |
||||
|
||||
|
||||
def _split_characters(text: str) -> Iterator[str]:
    """Yield the characters of *text*, keeping each emoji intact.

    Spans detected by ``emoji.emoji_list`` (which may be multiple code
    points, e.g. skin-tone or ZWJ sequences) are yielded as single
    strings; everything between and after them is yielded one character
    at a time, preserving the original order.
    """
    cursor = 0
    for match in emoji.emoji_list(text):
        # Plain characters up to the start of this emoji span.
        yield from text[cursor : match["match_start"]]
        yield match["emoji"]
        cursor = match["match_end"]
    # Trailing characters after the last emoji (or the whole text if none).
    yield from text[cursor:]
||||
|
||||
|
||||
def tokenize(
    text: str, include: str = "'", include_after_number: str = ",."
) -> Iterator[str]:
    """Lazily split *text* into normalized word/number tokens and emoji.

    The text is NFKD-normalized and casefolded, then scanned
    character-by-character (emoji sequences are kept intact by
    :func:`_split_characters`).

    Args:
        text: The string to tokenize.
        include: Characters always allowed inside a token (e.g. the
            apostrophe in "don't").
        include_after_number: Characters allowed inside a token only
            when the character immediately before them is a digit
            (e.g. "1,000" or "3.14").

    Yields:
        Word/number tokens and individual emoji, in order of appearance.
    """
    last_token = ""

    for char in _split_characters(
        unicodedata.normalize("NFKD", text).casefold()
    ):
        if (
            char.isalpha()
            or char.isnumeric()
            # Fix: use `in` (not `==`) so every character of *include*
            # is honored — previously a multi-character *include* could
            # never match. Consistent with include_after_number below;
            # identical behavior for the single-character default "'".
            or char in include
            or (
                char in include_after_number
                # Prepend a space so the lookback is safe when
                # last_token is empty: a leading "," or "." can never
                # start a token.
                and (" " + last_token)[-1].isnumeric()
            )
        ):
            last_token += char
        elif emoji.is_emoji(char):
            # An emoji is its own token and terminates the current one.
            yield char
            last_token = ""
        else:
            # Any other character is a separator.
            if last_token:
                yield last_token
            last_token = ""

    if last_token:
        yield last_token
||||
|
||||
|
||||
def ngrams(tokens: Iterable[str], max_ngram_length: int) -> List[str]:
    """Return all n-grams of *tokens* up to *max_ngram_length* words.

    Each n-gram is a space-joined run of consecutive tokens; all
    unigrams come first, then all bigrams, and so on.

    Args:
        tokens: The tokens to combine (any iterable; it is consumed).
        max_ngram_length: The largest n-gram size to generate (>= 1).

    Returns:
        A list of space-joined n-grams.  Fix: the ``max_ngram_length ==
        1`` fast path previously returned the *tokens* argument itself,
        violating the declared ``List[str]`` return type whenever a
        generator (such as the output of ``tokenize``) was passed; it
        now always returns a list.
    """
    tokens_list = list(tokens)

    if max_ngram_length == 1:
        return tokens_list

    # Renamed from `ngrams` to avoid shadowing the function itself.
    result: List[str] = []
    for length in range(1, max_ngram_length + 1):
        for start in range(len(tokens_list) - length + 1):
            result.append(" ".join(tokens_list[start : start + length]))
    return result
Loading…
Reference in new issue