From 10668691ead42c4e2d5e9ff96a5056453f741022 Mon Sep 17 00:00:00 2001
From: Samuel Sloniker
Date: Sat, 24 Dec 2022 10:46:40 -0800
Subject: [PATCH] Normalize characters

Closes #3

---
 gptc/tokenizer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py
index f9251ee..33a2744 100644
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@@ -3,10 +3,11 @@ from typing import List, Union
 
 import hashlib
 import emoji
+import unicodedata
 
 
 def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
-    text = text.lower()
+    text = unicodedata.normalize("NFKD", text).lower()
     parts = []
     highest_end = 0
     for emoji_part in emoji.emoji_list(text):
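
Note for reviewers (commentary, not part of the commit): NFKD
compatibility decomposition folds variant code points to a common form
before lowercasing, so visually identical inputs tokenize the same way.
A minimal illustration of the effect in a Python session (the example
strings are my own, not from the patch):

    import unicodedata

    # The "fi" ligature (U+FB01) decomposes to plain "f" + "i".
    print(unicodedata.normalize("NFKD", "\ufb01sh"))  # fish

    # Fullwidth letters decompose to their ASCII forms, which then
    # lowercase normally.
    print(unicodedata.normalize("NFKD", "\uff28\uff45\uff4c\uff4c\uff4f").lower())  # hello

    # Accented characters split into base letter + combining mark;
    # NFKD alone keeps the mark rather than stripping it.
    print([hex(ord(c)) for c in unicodedata.normalize("NFKD", "\u00e9")])  # ['0x65', '0x301']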