parent
74b2ba81b9
commit
295a1189de
|
@ -19,7 +19,12 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
|
|||
tokens = [""]
|
||||
|
||||
for char in converted_text:
|
||||
if char.isalpha() or char == "'":
|
||||
if (
|
||||
char.isalpha()
|
||||
or char.isnumeric()
|
||||
or char == "'"
|
||||
or (char in ",." and (" " + tokens[-1])[-1].isnumeric())
|
||||
):
|
||||
tokens[-1] += char
|
||||
elif emoji.is_emoji(char):
|
||||
tokens.append(char)
|
||||
|
|
Loading…
Reference in New Issue
Block a user