diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py index 337779d..a24adf7 100644 --- a/gptc/tokenizer.py +++ b/gptc/tokenizer.py @@ -7,7 +7,7 @@ import unicodedata def tokenize(text: str, max_ngram_length: int = 1) -> List[str]: - text = unicodedata.normalize("NFKD", text).lower() + text = unicodedata.normalize("NFKD", text).casefold() parts = [] highest_end = 0 for emoji_part in emoji.emoji_list(text):