Include numbers in tokenized output

Closes #12
This commit is contained in:
Samuel Sloniker 2022-12-24 10:42:50 -08:00
parent 74b2ba81b9
commit 295a1189de
Signed by: kj7rrv
GPG Key ID: 1BB4029E66285A62

View File

@@ -19,7 +19,12 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
tokens = [""]
for char in converted_text:
if char.isalpha() or char == "'":
if (
char.isalpha()
or char.isnumeric()
or char == "'"
or (char in ",." and (" " + tokens[-1])[-1].isnumeric())
):
tokens[-1] += char
elif emoji.is_emoji(char):
tokens.append(char)