Tokenizer and hasher

Samuel Sloniker 2023-06-15 12:00:35 -07:00
parent 7d55dd94fe
commit a1a200dd58
2 changed files with 73 additions and 0 deletions

micronlp/hasher.py (new file, +14)

@@ -0,0 +1,14 @@
# SPDX-License-Identifier: GPL-3.0-or-later
from typing import Iterable, Iterator
import hashlib


def hash_single(token: str) -> int:
    # Take the first 6 bytes (48 bits) of the token's SHA-256 digest and
    # interpret them as a big-endian integer.
    return int.from_bytes(
        hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
    )


def hash_list(tokens: Iterable[str]) -> Iterator[int]:
    # Lazily hash each token; returns a generator, not a list.
    return (hash_single(token) for token in tokens)
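
A minimal usage sketch of the hasher, assuming the package is importable as micronlp; the session below is illustrative, not part of the commit:

from micronlp.hasher import hash_single, hash_list

# SHA-256("hello") begins with bytes 2c f2 4d ba 5f b0, so the 48-bit
# hash of "hello" is 0x2cf24dba5fb0.
print(hex(hash_single("hello")))  # 0x2cf24dba5fb0
assert 0 <= hash_single("hello") < 2**48

# hash_list is lazy; materialize it when a list is needed.
print(list(hash_list(["hello", "world"])))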

micronlp/tokenizer.py (new file, +59)

@@ -0,0 +1,59 @@
# SPDX-License-Identifier: GPL-3.0-or-later
import unicodedata
from typing import List, Iterator, Iterable
import emoji


def _split_characters(text: str) -> Iterator[str]:
    # Yield the text one character at a time, except that each emoji
    # (which may span several code points) is yielded as a single unit.
    highest_end = 0
    for emoji_part in emoji.emoji_list(text):
        # Plain characters between the previous emoji and this one.
        for char in text[highest_end : emoji_part["match_start"]]:
            yield char
        yield emoji_part["emoji"]
        highest_end = emoji_part["match_end"]
    # Anything after the last emoji.
    for char in text[highest_end:]:
        yield char
def tokenize(
    text: str, include: str = "'", include_after_number: str = ",."
) -> Iterator[str]:
    # NFKD normalization plus casefold makes tokenization insensitive to
    # case and to most Unicode presentation differences.
    last_token = ""
    for char in _split_characters(
        unicodedata.normalize("NFKD", text).casefold()
    ):
        if (
            char.isalpha()
            or char.isnumeric()
            or char == include
            or (
                char in include_after_number
                # Prepending a space makes the lookback safe when
                # last_token is empty.
                and (" " + last_token)[-1].isnumeric()
            )
        ):
            last_token += char
        elif emoji.is_emoji(char):
            # An emoji ends the current token and is a token of its own.
            if last_token:
                yield last_token
            yield char
            last_token = ""
        else:
            # Any other character is a separator.
            if last_token:
                yield last_token
            last_token = ""
    if last_token:
        yield last_token
def ngrams(tokens: Iterable[str], max_ngram_length: int) -> List[str]:
    tokens_list = list(tokens)
    if max_ngram_length == 1:
        # Unigrams are just the tokens themselves; listifying keeps the
        # declared List[str] return type even when given a generator.
        return tokens_list
    ngrams_list = []
    for ngram_length in range(1, max_ngram_length + 1):
        # Slide a window of ngram_length tokens across the token list.
        for index in range(len(tokens_list) + 1 - ngram_length):
            ngrams_list.append(
                " ".join(tokens_list[index : index + ngram_length])
            )
    return ngrams_list
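
A short demo of the tokenizer and n-gram helper, again assuming micronlp is importable; the expected output in the comments follows from a reading of the code above:

from micronlp.tokenizer import tokenize, ngrams

tokens = list(tokenize("Hello, world! It's 3.5 😀"))
print(tokens)
# ['hello', 'world', "it's", '3.5', '😀']
# The comma after "hello" is dropped, but the "." inside "3.5" is kept
# because it follows a digit; the emoji is its own token.

print(ngrams(["hello", "world"], 2))
# ['hello', 'world', 'hello world'] (all n-grams up to length 2)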