Tokenizer and hasher
parent 7d55dd94fe
commit a1a200dd58
micronlp/hasher.py  (new file, 14 lines)
@@ -0,0 +1,14 @@
# SPDX-License-Identifier: GPL-3.0-or-later

from typing import Iterable, Iterator
import hashlib


def hash_single(token: str) -> int:
    return int.from_bytes(
        hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
    )


def hash_list(tokens: Iterable[str]) -> Iterator[int]:
    return (hash_single(token) for token in tokens)
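For context, a minimal usage sketch of the new hasher (illustrative, not part of the commit; assumes the package is importable as micronlp): hash_single truncates a token's SHA-256 digest to 6 bytes and reads it as a big-endian integer, so values fall in the range 0 <= h < 2**48, and hash_list lazily maps that over an iterable of tokens.

# Illustrative usage, not part of this commit.
from micronlp.hasher import hash_single, hash_list

h = hash_single("hello")                      # deterministic 48-bit integer
hashes = list(hash_list(["hello", "world"]))  # hash_list returns a generator; realized here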
micronlp/tokenizer.py  (new file, 59 lines)
@@ -0,0 +1,59 @@
# SPDX-License-Identifier: GPL-3.0-or-later

import unicodedata
from typing import List, Iterator, Iterable
import emoji


def _split_characters(text: str) -> Iterator[str]:
    highest_end = 0
    for emoji_part in emoji.emoji_list(text):
        for char in text[highest_end : emoji_part["match_start"]]:
            yield char
        yield emoji_part["emoji"]
        highest_end = emoji_part["match_end"]
    for char in text[highest_end:]:
        yield char


def tokenize(
    text: str, include: str = "'", include_after_number: str = ",."
) -> Iterator[str]:
    last_token = ""

    for char in _split_characters(
        unicodedata.normalize("NFKD", text).casefold()
    ):
        if (
            char.isalpha()
            or char.isnumeric()
            or char == include
            or (
                char in include_after_number
                and (" " + last_token)[-1].isnumeric()
            )
        ):
            last_token += char
        elif emoji.is_emoji(char):
            yield char
            last_token = ""
        else:
            if last_token:
                yield last_token
                last_token = ""

    if last_token:
        yield last_token


def ngrams(tokens: Iterable[str], max_ngram_length: int) -> List[str]:
    if max_ngram_length == 1:
        return tokens

    tokens_list = list(tokens)

    ngrams = []
    for ngram_length in range(1, max_ngram_length + 1):
        for index in range(len(tokens_list) + 1 - ngram_length):
            ngrams.append(" ".join(tokens_list[index : index + ngram_length]))
    return ngrams
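For context, a usage sketch of the tokenizer (illustrative, not part of the commit; assumes micronlp is importable and the emoji package is installed). tokenize NFKD-normalizes and casefolds the text, keeps letters, digits, and apostrophes inside tokens, keeps "," and "." only when the previous character is a digit, and yields each emoji as its own token; ngrams then space-joins every run of 1..max_ngram_length tokens.

# Illustrative usage, not part of this commit.
from micronlp.tokenizer import tokenize, ngrams

tokens = list(tokenize("Hello, world! Version 2.5 🙂"))
# ['hello', 'world', 'version', '2.5', '🙂']
# The comma after "hello" is dropped (previous char is not a digit); the "." in "2.5" is kept.

print(ngrams(tokens, 2))
# ['hello', 'world', 'version', '2.5', '🙂',
#  'hello world', 'world version', 'version 2.5', '2.5 🙂']
# Note: with max_ngram_length == 1, ngrams returns the tokens iterable unchanged.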