parent
74b2ba81b9
commit
295a1189de
|
@ -19,7 +19,12 @@ def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
|
||||||
tokens = [""]
|
tokens = [""]
|
||||||
|
|
||||||
for char in converted_text:
|
for char in converted_text:
|
||||||
if char.isalpha() or char == "'":
|
if (
|
||||||
|
char.isalpha()
|
||||||
|
or char.isnumeric()
|
||||||
|
or char == "'"
|
||||||
|
or (char in ",." and (" " + tokens[-1])[-1].isnumeric())
|
||||||
|
):
|
||||||
tokens[-1] += char
|
tokens[-1] += char
|
||||||
elif emoji.is_emoji(char):
|
elif emoji.is_emoji(char):
|
||||||
tokens.append(char)
|
tokens.append(char)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user