Hash words and ngrams

2022-11-23 12:53:01 -08:00 · 2022-11-23 12:53:01 -08:00 · f4ae5f851d
commit f4ae5f851d
parent 1d1ccbb7cc
1 changed file with 10 additions and 2 deletions
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: GPL-3.0-or-later

 from typing import List, Union
+import hashlib
+import base64

 try:
    import emoji
@@ -41,10 +43,16 @@ def tokenize(
    tokens = [string for string in tokens if string]

    if max_ngram_length == 1:
-        return tokens
+        ngrams = tokens
    else:
        ngrams = []
        for ngram_length in range(1, max_ngram_length + 1):
            for index in range(len(tokens) + 1 - ngram_length):
                ngrams.append(" ".join(tokens[index : index + ngram_length]))
-        return ngrams
+
+    return [
+        base64.b64encode(
+            hashlib.sha256(token.encode("utf-8")).digest()[:6]
+        ).decode("ascii")
+        for token in ngrams
+    ]