Tokenizer and hasher
This commit is contained in:
parent 7d55dd94fe
commit a1a200dd58
micronlp/hasher.py  (new file, 14 lines)
@@ -0,0 +1,14 @@
# SPDX-License-Identifier: GPL-3.0-or-later

from typing import Iterable, Iterator
import hashlib


def hash_single(token: str) -> int:
    return int.from_bytes(
        hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
    )


def hash_list(tokens: Iterable[str]) -> Iterator[int]:
    return (hash_single(token) for token in tokens)
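For illustration only (not part of the commit), a usage sketch of the new hasher module, assuming micronlp is importable as a package; the sample tokens are my own:

# Hypothetical usage of micronlp/hasher.py -- illustrative, not in the commit.
from micronlp.hasher import hash_single, hash_list

# hash_single keeps only the first 6 bytes of the SHA-256 digest,
# so every token maps deterministically to a 48-bit integer.
assert hash_single("hello") < 2**48

# hash_list is lazy: it returns a generator over the input tokens.
print(list(hash_list(["hello", "world"])))  # two 48-bit integers

Presumably the 48-bit truncation keeps the hash values compact while collisions remain unlikely for vocabulary-sized token sets.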
micronlp/tokenizer.py  (new file, 59 lines)
@@ -0,0 +1,59 @@
# SPDX-License-Identifier: GPL-3.0-or-later

import unicodedata
from typing import List, Iterator, Iterable
import emoji


def _split_characters(text: str) -> Iterator[str]:
    highest_end = 0
    for emoji_part in emoji.emoji_list(text):
        for char in text[highest_end : emoji_part["match_start"]]:
            yield char
        yield emoji_part["emoji"]
        highest_end = emoji_part["match_end"]
    for char in text[highest_end:]:
        yield char


def tokenize(
    text: str, include: str = "'", include_after_number: str = ",."
) -> Iterator[str]:
    last_token = ""

    for char in _split_characters(
        unicodedata.normalize("NFKD", text).casefold()
    ):
        if (
            char.isalpha()
            or char.isnumeric()
            or char == include
            or (
                char in include_after_number
                and (" " + last_token)[-1].isnumeric()
            )
        ):
            last_token += char
        elif emoji.is_emoji(char):
            yield char
            last_token = ""
        else:
            if last_token:
                yield last_token
            last_token = ""

    if last_token:
        yield last_token


def ngrams(tokens: Iterable[str], max_ngram_length: int) -> List[str]:
    if max_ngram_length == 1:
        return list(tokens)

    tokens_list = list(tokens)

    ngrams = []
    for ngram_length in range(1, max_ngram_length + 1):
        for index in range(len(tokens_list) + 1 - ngram_length):
            ngrams.append(" ".join(tokens_list[index : index + ngram_length]))
    return ngrams
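For illustration only (not part of the commit), a usage sketch of the new tokenizer; the sample sentence and the outputs shown in comments are my own assumptions about its behaviour:

# Hypothetical usage of micronlp/tokenizer.py -- illustrative, not in the commit.
from micronlp.tokenizer import tokenize, ngrams

# tokenize() NFKD-normalizes and casefolds the text, keeps apostrophes inside
# words, keeps "," and "." only when they directly follow a digit, and emits
# each emoji as its own token.
print(list(tokenize("Don't spend $1,000.50 on 🍕!")))
# expected: ["don't", 'spend', '1,000.50', 'on', '🍕']

# ngrams() joins runs of consecutive tokens with spaces, up to the given length.
print(ngrams(["a", "b", "c"], 2))
# expected: ['a', 'b', 'c', 'a b', 'b c']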