From b3a43150d87ef274dfd3adcfbeff3cfefd6bdf4a Mon Sep 17 00:00:00 2001
From: Samuel Sloniker
Date: Sat, 26 Nov 2022 17:42:42 -0800
Subject: [PATCH] Split hash function

---
 gptc/tokenizer.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py
index 1d6ca10..9902291 100644
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@@ -39,13 +39,14 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
     return ngrams
 
 
+def hash_single(token: str) -> int:
+    return int.from_bytes(
+        hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
+    )
+
+
 def hash(tokens: List[str]) -> List[int]:
-    return [
-        int.from_bytes(
-            hashlib.sha256(token.encode("utf-8")).digest()[:6], "big"
-        )
-        for token in tokens
-    ]
+    return [hash_single(token) for token in tokens]
 
 
 def normalize(text: str) -> str: