Tokenizer and hasher

Samuel Sloniker 2023-06-15 12:00:35 -07:00
parent 7d55dd94fe
commit a1a200dd58
2 changed files with 73 additions and 0 deletions

micronlp/hasher.py (new file, +14)

@@ -0,0 +1,14 @@
# SPDX-License-Identifier: GPL-3.0-or-later
from typing import Iterable, Iterator
import hashlib


def hash_single(token: str) -> int:
    # Take the first 6 bytes (48 bits) of the token's SHA-256 digest and
    # interpret them as a big-endian integer.
    return int.from_bytes(
        hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
    )


def hash_list(tokens: Iterable[str]) -> Iterator[int]:
    # Lazily hash each token; returns a generator, not a list.
    return (hash_single(token) for token in tokens)
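
A minimal usage sketch of the hasher, assuming the package is importable as micronlp; the session below is illustrative, not part of the commit:

from micronlp.hasher import hash_single, hash_list

# SHA-256("hello") begins with bytes 2c f2 4d ba 5f b0, so the 48-bit
# hash of "hello" is 0x2cf24dba5fb0.
print(hex(hash_single("hello")))  # 0x2cf24dba5fb0
assert 0 <= hash_single("hello") < 2**48

# hash_list is lazy; materialize it when a list is needed.
print(list(hash_list(["hello", "world"])))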

micronlp/tokenizer.py (new file, +59)

@@ -0,0 +1,59 @@
# SPDX-License-Identifier: GPL-3.0-or-later
import unicodedata
from typing import List, Iterator, Iterable
import emoji


def _split_characters(text: str) -> Iterator[str]:
    # Yield the text one character at a time, except that each emoji
    # (which may span several code points) is yielded as a single unit.
    highest_end = 0
    for emoji_part in emoji.emoji_list(text):
        # Plain characters between the previous emoji and this one.
        for char in text[highest_end : emoji_part["match_start"]]:
            yield char
        yield emoji_part["emoji"]
        highest_end = emoji_part["match_end"]
    # Anything after the last emoji.
    for char in text[highest_end:]:
        yield char
def tokenize(
    text: str, include: str = "'", include_after_number: str = ",."
) -> Iterator[str]:
    # NFKD normalization plus casefold makes tokenization insensitive to
    # case and to most Unicode presentation differences.
    last_token = ""
    for char in _split_characters(
        unicodedata.normalize("NFKD", text).casefold()
    ):
        if (
            char.isalpha()
            or char.isnumeric()
            or char == include
            or (
                char in include_after_number
                # Prepending a space makes the lookback safe when
                # last_token is empty.
                and (" " + last_token)[-1].isnumeric()
            )
        ):
            last_token += char
        elif emoji.is_emoji(char):
            # An emoji ends the current token and is a token of its own.
            if last_token:
                yield last_token
            yield char
            last_token = ""
        else:
            # Any other character is a separator.
            if last_token:
                yield last_token
            last_token = ""
    if last_token:
        yield last_token
def ngrams(tokens: Iterable[str], max_ngram_length: int) -> List[str]:
    tokens_list = list(tokens)
    if max_ngram_length == 1:
        # Unigrams are just the tokens themselves; listifying keeps the
        # declared List[str] return type even when given a generator.
        return tokens_list
    ngrams_list = []
    for ngram_length in range(1, max_ngram_length + 1):
        # Slide a window of ngram_length tokens across the token list.
        for index in range(len(tokens_list) + 1 - ngram_length):
            ngrams_list.append(
                " ".join(tokens_list[index : index + ngram_length])
            )
    return ngrams_list
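
A short demo of the tokenizer and n-gram helper, again assuming micronlp is importable; the expected output in the comments follows from a reading of the code above:

from micronlp.tokenizer import tokenize, ngrams

tokens = list(tokenize("Hello, world! It's 3.5 😀"))
print(tokens)
# ['hello', 'world', "it's", '3.5', '😀']
# The comma after "hello" is dropped, but the "." inside "3.5" is kept
# because it follows a digit; the emoji is its own token.

print(ngrams(["hello", "world"], 2))
# ['hello', 'world', 'hello world'] (all n-grams up to length 2)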