Lightweight NLP library in pure Python - currently implements a text classifier
You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
# SPDX-License-Identifier: LGPL-3.0-or-later |
|
|
|
|
|
def tokenize(text):
    """Split a string into a list of lower-cased word tokens.

    The text is lower-cased and scanned character by character. A token is a
    maximal run of alphabetic characters (as reported by ``str.isalpha``) and
    apostrophes; every other character acts as a separator and is discarded.
    No stemming or lemmatization is performed.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty token strings in order of appearance. The list is
        empty when the text contains no alphabetic characters or apostrophes.
    """
    tokens = []
    current = []  # characters of the token currently being built

    for char in text.lower():
        # Apostrophes count as word characters so contractions such as
        # "don't" stay in one piece.
        if char.isalpha() or char == "'":
            current.append(char)
        elif current:
            # A separator ends the token in progress; consecutive separators
            # find the buffer empty and are skipped.
            tokens.append("".join(current))
            current = []

    # Flush a token that runs up to the end of the text.
    if current:
        tokens.append("".join(current))

    return tokens
|
|
|