diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py index 8915682..c2725f6 100644 --- a/gptc/tokenizer.py +++ b/gptc/tokenizer.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-3.0-or-later from typing import List, Union +import hashlib +import base64 try: import emoji @@ -41,10 +43,16 @@ def tokenize( tokens = [string for string in tokens if string] if max_ngram_length == 1: - return tokens + ngrams = tokens else: ngrams = [] for ngram_length in range(1, max_ngram_length + 1): for index in range(len(tokens) + 1 - ngram_length): ngrams.append(" ".join(tokens[index : index + ngram_length])) - return ngrams + + return [ + base64.b64encode( + hashlib.sha256(token.encode("utf-8")).digest()[:6] + ).decode("ascii") + for token in ngrams + ]