From a1a200dd587023d4dc3d06714a3bb5b31fbd4ca6 Mon Sep 17 00:00:00 2001
From: Samuel Sloniker
Date: Thu, 15 Jun 2023 12:00:35 -0700
Subject: [PATCH] Tokenizer and hasher

---
 micronlp/hasher.py    | 14 ++++++++++
 micronlp/tokenizer.py | 59 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 micronlp/hasher.py
 create mode 100644 micronlp/tokenizer.py

diff --git a/micronlp/hasher.py b/micronlp/hasher.py
new file mode 100644
index 0000000..ab38a6e
--- /dev/null
+++ b/micronlp/hasher.py
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+from typing import Iterable, Iterator
+import hashlib
+
+
+def hash_single(token: str) -> int:
+    return int.from_bytes(
+        hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
+    )
+
+
+def hash_list(tokens: Iterable[str]) -> Iterator[int]:
+    return (hash_single(token) for token in tokens)
diff --git a/micronlp/tokenizer.py b/micronlp/tokenizer.py
new file mode 100644
index 0000000..e373763
--- /dev/null
+++ b/micronlp/tokenizer.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+import unicodedata
+from typing import List, Iterator, Iterable
+import emoji
+
+
+def _split_characters(text: str) -> Iterator[str]:
+    highest_end = 0
+    for emoji_part in emoji.emoji_list(text):
+        for char in text[highest_end : emoji_part["match_start"]]:
+            yield char
+        yield emoji_part["emoji"]
+        highest_end = emoji_part["match_end"]
+    for char in text[highest_end:]:
+        yield char
+
+
+def tokenize(
+    text: str, include: str = "'", include_after_number: str = ",."
+) -> Iterator[str]:
+    last_token = ""
+
+    for char in _split_characters(
+        unicodedata.normalize("NFKD", text).casefold()
+    ):
+        if (
+            char.isalpha()
+            or char.isnumeric()
+            or char == include
+            or (
+                char in include_after_number
+                and (" " + last_token)[-1].isnumeric()
+            )
+        ):
+            last_token += char
+        elif emoji.is_emoji(char):
+            yield char
+            last_token = ""
+        else:
+            if last_token:
+                yield last_token
+                last_token = ""
+
+    if last_token:
+        yield last_token
+
+
+def ngrams(tokens: Iterable[str], max_ngram_length: int) -> List[str]:
+    if max_ngram_length == 1:
+        return list(tokens)
+
+    tokens_list = list(tokens)
+
+    ngram_list = []
+    for ngram_length in range(1, max_ngram_length + 1):
+        for index in range(len(tokens_list) + 1 - ngram_length):
+            ngram_list.append(" ".join(tokens_list[index : index + ngram_length]))
+    return ngram_list
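
As a rough sketch of how the two new modules are meant to chain together (not part of the patch itself; it assumes the micronlp package is importable and that the third-party emoji dependency is installed):

    from micronlp.hasher import hash_list
    from micronlp.tokenizer import ngrams, tokenize

    text = "Python 3.12 is out 🎉"

    # Case-folded, NFKD-normalized word/number/emoji tokens:
    # ['python', '3.12', 'is', 'out', '🎉']
    tokens = list(tokenize(text))

    # Unigrams plus space-joined bigrams, e.g. 'python 3.12', '3.12 is', ...
    grams = ngrams(tokens, max_ngram_length=2)

    # 48-bit integers taken from the first 6 bytes of each SHA-256 digest
    hashes = list(hash_list(grams))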