You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
59 lines
1.6 KiB
59 lines
1.6 KiB
# SPDX-License-Identifier: GPL-3.0-or-later |
|
|
|
import unicodedata |
|
from typing import Iterator, Iterable |
|
import emoji |
|
|
|
|
|
def _split_characters(text: str) -> Iterator[str]:
    """Yield the characters of *text*, keeping each emoji intact.

    Multi-codepoint emoji sequences reported by ``emoji.emoji_list`` are
    yielded as single strings; every other character is yielded on its own.
    """
    cursor = 0
    for match in emoji.emoji_list(text):
        # Plain characters sitting between the previous emoji and this one.
        yield from text[cursor : match["match_start"]]
        yield match["emoji"]
        cursor = match["match_end"]
    # Remainder after the last emoji (the whole string if there were none).
    yield from text[cursor:]
|
|
|
|
|
def tokenize(
    text: str, include: str = "'", include_after_number: str = ",."
) -> Iterator[str]:
    """Yield lowercase word tokens and individual emoji from *text*.

    The text is NFKD-normalized and casefolded first. A token accumulates
    alphanumeric characters, any character equal to *include*, and any
    character from *include_after_number* that directly follows a digit
    (so e.g. thousands separators stay inside numeric tokens). Emoji are
    yielded immediately as their own tokens; everything else ends the
    current token.
    """
    normalized = unicodedata.normalize("NFKD", text).casefold()
    token = ""

    for char in _split_characters(normalized):
        # Prepending a space makes the last-character check safe when the
        # token is still empty.
        after_digit = char in include_after_number and (" " + token)[-1].isnumeric()

        if char.isalpha() or char.isnumeric() or char == include or after_digit:
            token += char
        elif emoji.is_emoji(char):
            yield char
            token = ""
        elif token:
            yield token
            token = ""

    if token:
        yield token
|
|
|
|
|
def ngrams(tokens: Iterable[str], max_ngram_length: int) -> Iterable[str]:
    """Return all n-grams of *tokens* with 1 to *max_ngram_length* words.

    Each n-gram is the space-joined run of consecutive tokens. When
    *max_ngram_length* is 1 the input iterable is returned unchanged
    (no list is materialized); otherwise a list is returned, grouped by
    ascending n-gram length.
    """
    if max_ngram_length == 1:
        return tokens

    tokens_list = list(tokens)
    # Comprehension instead of a manual append loop; also avoids the
    # original's local variable shadowing the function name `ngrams`.
    return [
        " ".join(tokens_list[start : start + length])
        for length in range(1, max_ngram_length + 1)
        for start in range(len(tokens_list) - length + 1)
    ]
|
|
|