From a1a200dd587023d4dc3d06714a3bb5b31fbd4ca6 Mon Sep 17 00:00:00 2001
From: Samuel Sloniker
Date: Thu, 15 Jun 2023 12:00:35 -0700
Subject: [PATCH] Tokenizer and hasher

---
 micronlp/hasher.py    | 14 ++++++++++
 micronlp/tokenizer.py | 59 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 micronlp/hasher.py
 create mode 100644 micronlp/tokenizer.py

diff --git a/micronlp/hasher.py b/micronlp/hasher.py
new file mode 100644
index 0000000..ab38a6e
--- /dev/null
+++ b/micronlp/hasher.py
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+from typing import Iterable, Iterator
+import hashlib
+
+
+def hash_single(token: str) -> int:
+    return int.from_bytes(
+        hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
+    )
+
+
+def hash_list(tokens: Iterable[str]) -> Iterator[int]:
+    return (hash_single(token) for token in tokens)
diff --git a/micronlp/tokenizer.py b/micronlp/tokenizer.py
new file mode 100644
index 0000000..e373763
--- /dev/null
+++ b/micronlp/tokenizer.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+import unicodedata
+from typing import List, Iterator, Iterable
+import emoji
+
+
+def _split_characters(text: str) -> Iterator[str]:
+    highest_end = 0
+    for emoji_part in emoji.emoji_list(text):
+        for char in text[highest_end : emoji_part["match_start"]]:
+            yield char
+        yield emoji_part["emoji"]
+        highest_end = emoji_part["match_end"]
+    for char in text[highest_end:]:
+        yield char
+
+
+def tokenize(
+    text: str, include: str = "'", include_after_number: str = ",."
+) -> Iterator[str]:
+    last_token = ""
+
+    for char in _split_characters(
+        unicodedata.normalize("NFKD", text).casefold()
+    ):
+        if (
+            char.isalpha()
+            or char.isnumeric()
+            or char == include
+            or (
+                char in include_after_number
+                and (" " + last_token)[-1].isnumeric()
+            )
+        ):
+            last_token += char
+        elif emoji.is_emoji(char):
+            yield char
+            last_token = ""
+        else:
+            if last_token:
+                yield last_token
+                last_token = ""
+
+    if last_token:
+        yield last_token
+
+
+def ngrams(tokens: Iterable[str], max_ngram_length: int) -> List[str]:
+    if max_ngram_length == 1:
+        return list(tokens)
+
+    tokens_list = list(tokens)
+
+    ngram_list = []
+    for ngram_length in range(1, max_ngram_length + 1):
+        for index in range(len(tokens_list) + 1 - ngram_length):
+            ngram_list.append(" ".join(tokens_list[index : index + ngram_length]))
+    return ngram_list
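
As a rough sketch of how the two new modules are meant to chain together (not part of the patch itself; it assumes the micronlp package is importable and that the third-party emoji dependency is installed):

    from micronlp.hasher import hash_list
    from micronlp.tokenizer import ngrams, tokenize

    text = "Python 3.12 is out 🎉"

    # Case-folded, NFKD-normalized word/number/emoji tokens:
    # ['python', '3.12', 'is', 'out', '🎉']
    tokens = list(tokenize(text))

    # Unigrams plus space-joined bigrams, e.g. 'python 3.12', '3.12 is', ...
    grams = ngrams(tokens, max_ngram_length=2)

    # 48-bit integers taken from the first 6 bytes of each SHA-256 digest
    hashes = list(hash_list(grams))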