From 99ad07a876a375aa41cc8deef7f6a0300c4c88a6 Mon Sep 17 00:00:00 2001 From: Samuel Sloniker Date: Sun, 16 Apr 2023 14:49:03 -0700 Subject: [PATCH] Casefold Closes #14 --- gptc/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py index 337779d..a24adf7 100644 --- a/gptc/tokenizer.py +++ b/gptc/tokenizer.py @@ -7,7 +7,7 @@ import unicodedata def tokenize(text: str, max_ngram_length: int = 1) -> List[str]: - text = unicodedata.normalize("NFKD", text).lower() + text = unicodedata.normalize("NFKD", text).casefold() parts = [] highest_end = 0 for emoji_part in emoji.emoji_list(text):