parent
295a1189de
commit
10668691ea
|
@ -3,10 +3,11 @@
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
import hashlib
|
import hashlib
|
||||||
import emoji
|
import emoji
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
|
||||||
def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
|
def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
|
||||||
text = text.lower()
|
text = unicodedata.normalize("NFKD", text).lower()
|
||||||
parts = []
|
parts = []
|
||||||
highest_end = 0
|
highest_end = 0
|
||||||
for emoji_part in emoji.emoji_list(text):
|
for emoji_part in emoji.emoji_list(text):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user