diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py index f9251ee..33a2744 100644 --- a/gptc/tokenizer.py +++ b/gptc/tokenizer.py @@ -3,10 +3,11 @@ from typing import List, Union import hashlib import emoji +import unicodedata def tokenize(text: str, max_ngram_length: int = 1) -> List[str]: - text = text.lower() + text = unicodedata.normalize("NFKD", text).lower() parts = [] highest_end = 0 for emoji_part in emoji.emoji_list(text):