parent
295a1189de
commit
10668691ea
|
@ -3,10 +3,11 @@
|
|||
from typing import List, Union
|
||||
import hashlib
|
||||
import emoji
|
||||
import unicodedata
|
||||
|
||||
|
||||
def tokenize(text: str, max_ngram_length: int = 1) -> List[str]:
|
||||
text = text.lower()
|
||||
text = unicodedata.normalize("NFKD", text).lower()
|
||||
parts = []
|
||||
highest_end = 0
|
||||
for emoji_part in emoji.emoji_list(text):
|
||||
|
|
Loading…
Reference in New Issue
Block a user