From 10668691ead42c4e2d5e9ff96a5056453f741022 Mon Sep 17 00:00:00 2001
From: Samuel Sloniker
Date: Sat, 24 Dec 2022 10:46:40 -0800
Subject: [PATCH] Normalize characters

Closes #3

---
 gptc/tokenizer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py
index f9251ee..33a2744 100644
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@@ -3,10 +3,11 @@ from typing import List, Union
 
 import hashlib
 import emoji
+import unicodedata
 
 
 def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
-    text = text.lower()
+    text = unicodedata.normalize("NFKD", text).lower()
     parts = []
     highest_end = 0
     for emoji_part in emoji.emoji_list(text):
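
Note for reviewers (commentary, not part of the commit): NFKD
compatibility decomposition folds variant code points to a common form
before lowercasing, so visually identical inputs tokenize the same way.
A minimal illustration of the effect in a Python session (the example
strings are my own, not from the patch):

    import unicodedata

    # The "fi" ligature (U+FB01) decomposes to plain "f" + "i".
    print(unicodedata.normalize("NFKD", "\ufb01sh"))  # fish

    # Fullwidth letters decompose to their ASCII forms, which then
    # lowercase normally.
    print(unicodedata.normalize("NFKD", "\uff28\uff45\uff4c\uff4c\uff4f").lower())  # hello

    # Accented characters split into base letter + combining mark;
    # NFKD alone keeps the mark rather than stripping it.
    print([hex(ord(c)) for c in unicodedata.normalize("NFKD", "\u00e9")])  # ['0x65', '0x301']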