From 295a1189def5923412dc391cb4e016619e9df3e8 Mon Sep 17 00:00:00 2001 From: Samuel Sloniker Date: Sat, 24 Dec 2022 10:42:50 -0800 Subject: [PATCH] Include numbers in tokenized output Closes #12 --- gptc/tokenizer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py index 9902291..f9251ee 100644 --- a/gptc/tokenizer.py +++ b/gptc/tokenizer.py @@ -19,7 +19,12 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[str]: tokens = [""] for char in converted_text: - if char.isalpha() or char == "'": + if ( + char.isalpha() + or char.isnumeric() + or char == "'" + or (char in ",." and (" " + tokens[-1])[-1].isnumeric()) + ): tokens[-1] += char elif emoji.is_emoji(char): tokens.append(char)