diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py
index 1d6ca10..9902291 100644
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@@ -39,13 +39,14 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
     return ngrams
 
 
+def hash_single(token: str) -> int:
+    return int.from_bytes(
+        hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
+    )
+
+
 def hash(tokens: List[str]) -> List[int]:
-    return [
-        int.from_bytes(
-            hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
-        )
-        for token in tokens
-    ]
+    return [hash_single(token) for token in tokens]
 
 
 def normalize(text: str) -> str:
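
Not part of the patch: a minimal standalone sketch of the refactored hashing, assuming only the Python standard library, to show the extracted helper in isolation.

import hashlib
from typing import List

def hash_single(token: str) -> int:
    # Truncate the token's SHA-256 digest to its first 6 bytes and read
    # them as a big-endian integer, yielding a value in [0, 2**48).
    return int.from_bytes(
        hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
    )

def hash(tokens: List[str]) -> List[int]:
    # The list version now delegates to hash_single, so callers can hash
    # a single token without building a one-element list first.
    return [hash_single(token) for token in tokens]

print(hash_single("hello"))        # one 48-bit integer
print(hash(["hello", "world"]))    # one 48-bit integer per token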