Compare commits


4 Commits

SHA1        Message                         Date
48639f5d8d  Non-working type checks         2022-07-17 16:51:19 -07:00
a207e281e7  Format code with black          2022-07-17 16:28:04 -07:00
e272ab42d1  Document emojis                 2022-07-17 16:27:16 -07:00
bd0028a108  Add emoji support to tokenizer  2022-07-17 16:14:02 -07:00
5 changed files with 64 additions and 10 deletions

View File

@@ -4,6 +4,12 @@ General-purpose text classifier in Python
 GPTC provides both a CLI tool and a Python library.
+## Installation
+pip install gptc[emoji] # handles emojis! (see section "Emoji")
+# Or, if you don't need emoji support,
+pip install gptc # no dependencies!
 ## CLI Tool
 ### Classifying text
@@ -72,6 +78,12 @@ reduced to the one used when compiling the model.
 Models compiled with older versions of GPTC which did not support ngrams are
 handled the same way as models compiled with `max_ngram_length=1`.
+## Emoji
+If the [`emoji`](https://pypi.org/project/emoji/) package is installed, GPTC
+will automatically handle emojis the same way as words. If it is not installed,
+GPTC will still work but will ignore emojis.
 ## Model format
 This section explains the raw model format, which is how you should create and

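Note on the README changes: the new Emoji section documents the behavior without showing it end to end. Below is a hedged usage sketch built only from names visible in these diffs (`gptc.compile`, `gptc.Classifier`, the raw model's `"text"` field); the `"category"` key and the `classify` method are assumptions and may differ from the real API.

    # Hedged sketch: classifying emoji-bearing text with GPTC.
    # The "category" key and Classifier.classify() are assumptions,
    # not confirmed by this diff.
    import gptc

    raw_model = [
        {"category": "cats", "text": "cats meow and purr 😺"},
        {"category": "dogs", "text": "dogs bark and fetch 🐶"},
    ]

    # Construction mirrors the benchmark diff below.
    classifier = gptc.Classifier(gptc.compile(raw_model, 1), 1)
    print(classifier.classify("a happy purring friend 😺"))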
View File

@@ -33,7 +33,9 @@ print(
 )
-classifier = gptc.Classifier(gptc.compile(raw_model, max_ngram_length), max_ngram_length)
+classifier = gptc.Classifier(
+    gptc.compile(raw_model, max_ngram_length), max_ngram_length
+)
 print(
     "Average classification time over",
     classify_iterations,

View File

@@ -6,19 +6,34 @@ import json
 import sys
 import gptc
 def main():
     parser = argparse.ArgumentParser(
         description="General Purpose Text Classifier", prog="gptc"
     )
     subparsers = parser.add_subparsers(dest="subparser_name", required=True)
-    compile_parser = subparsers.add_parser("compile", help="compile a raw model")
+    compile_parser = subparsers.add_parser(
+        "compile", help="compile a raw model"
+    )
     compile_parser.add_argument("model", help="raw model to compile")
-    compile_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
+    compile_parser.add_argument(
+        "--max-ngram-length",
+        "-n",
+        help="maximum ngram length",
+        type=int,
+        default=1,
+    )
     classify_parser = subparsers.add_parser("classify", help="classify text")
     classify_parser.add_argument("model", help="compiled model to use")
-    classify_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
+    classify_parser.add_argument(
+        "--max-ngram-length",
+        "-n",
+        help="maximum ngram length",
+        type=int,
+        default=1,
+    )
     group = classify_parser.add_mutually_exclusive_group()
     group.add_argument(
         "-j",

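The hunks above are pure black reformatting, but they expose the CLI's shape: one argparse subparser per subcommand, each with its own `--max-ngram-length` flag. A minimal standalone sketch of the same pattern (not GPTC's actual entry point):

    # Standalone sketch of the subparser pattern used above.
    import argparse

    parser = argparse.ArgumentParser(prog="gptc")
    subparsers = parser.add_subparsers(dest="subparser_name", required=True)

    compile_parser = subparsers.add_parser("compile", help="compile a raw model")
    compile_parser.add_argument("model", help="raw model to compile")
    compile_parser.add_argument(
        "--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1
    )

    # An invocation like `gptc compile model.json -n 2` parses to:
    args = parser.parse_args(["compile", "model.json", "-n", "2"])
    print(args.subparser_name, args.model, args.max_ngram_length)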
View File

@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import gptc.tokenizer
+from typing import Iterable, Mapping, List, Dict, Union
-def compile(raw_model, max_ngram_length=1):
+def compile(
+    raw_model: Iterable[Mapping[str, str]], max_ngram_length: int = 1
+) -> Dict[str, Union[int, List[Union[str, int]]]]:
     """Compile a raw model.
     Parameters
@@ -21,7 +24,7 @@ def compile(raw_model, max_ngram_length=1):
     """
-    categories = {}
+    categories: Dict[str, str] = {}
     for portion in raw_model:
         text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
@@ -31,7 +34,7 @@ def compile(raw_model, max_ngram_length=1):
         except KeyError:
             categories[category] = text
-    categories_by_count = {}
+    categories_by_count: Dict[str, Dict[str, float]] = {}
     names = []
@@ -49,7 +52,7 @@ def compile(raw_model, max_ngram_length=1):
                 categories_by_count[category][word] = 1 / len(
                     categories[category]
                 )
-    word_weights = {}
+    word_weights: Dict[str, Dict[str, float]] = {}
     for category, words in categories_by_count.items():
         for word, value in words.items():
             try:
@@ -57,7 +60,7 @@ def compile(raw_model, max_ngram_length=1):
             except KeyError:
                 word_weights[word] = {category: value}
-    model = {}
+    model: Dict[str, Union[int, List[Union[str, int]]]] = {}
     for word, weights in word_weights.items():
         total = sum(weights.values())
         model[word] = []

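The annotations added here also document the compile pipeline's data flow: `categories_by_count` stores each token's relative frequency within a category (each occurrence contributes `1 / len(categories[category])`), and `word_weights` regroups those frequencies by token so they can be normalized across categories. A toy rendering of that arithmetic, inferred from the visible hunks rather than copied from GPTC's actual model layout:

    # Toy sketch of the weighting arithmetic suggested by the hunks above.
    # The final normalization by `total` is inferred, not shown in the diff.
    categories = {
        "good": ["great", "great", "fine"],
        "bad": ["awful", "fine"],
    }

    # Relative frequency of each token within its category.
    categories_by_count = {
        cat: {word: words.count(word) / len(words) for word in set(words)}
        for cat, words in categories.items()
    }

    # Regroup by token across categories.
    word_weights = {}
    for category, words in categories_by_count.items():
        for word, value in words.items():
            word_weights.setdefault(word, {})[category] = value

    # "fine" appears in both categories, weighted by relative frequency.
    for word, weights in word_weights.items():
        total = sum(weights.values())
        print(word, {cat: value / total for cat, value in weights.items()})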
View File

@@ -1,13 +1,35 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+try:
+    import emoji
+    has_emoji = True
+except ImportError:
+    has_emoji = False
 def tokenize(text, max_ngram_length=1):
     """Convert a string to a list of lemmas."""
+    text = text.lower()
+    if has_emoji:
+        parts = []
+        highest_end = 0
+        for emoji_part in emoji.emoji_list(text):
+            parts += list(text[highest_end : emoji_part["match_start"]])
+            parts.append(emoji_part["emoji"])
+            highest_end = emoji_part["match_end"]
+        parts += list(text[highest_end:])
+        text = [part for part in parts if part]
     tokens = [""]
-    for char in text.lower():
+    for char in text:
         if char.isalpha() or char == "'":
             tokens[-1] += char
+        elif has_emoji and emoji.is_emoji(char):
+            tokens.append(char)
+            tokens.append("")
         elif tokens[-1] != "":
             tokens.append("")
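The pre-pass in this hunk keeps each emoji (which may span multiple codepoints) as a single unit while exploding the surrounding text into characters, so the main loop can emit emojis as standalone tokens. A standalone demonstration using the same `emoji.emoji_list` fields as the diff (requires the `emoji` package):

    # Demo of the emoji-aware splitting pass above.
    import emoji

    text = "cats 😺 rule"
    parts, highest_end = [], 0
    for emoji_part in emoji.emoji_list(text):
        # Plain characters before the emoji are split one by one...
        parts += list(text[highest_end : emoji_part["match_start"]])
        # ...but the emoji itself stays intact as a single unit.
        parts.append(emoji_part["emoji"])
        highest_end = emoji_part["match_end"]
    parts += list(text[highest_end:])
    print([part for part in parts if part])
    # -> ['c', 'a', 't', 's', ' ', '😺', ' ', 'r', 'u', 'l', 'e']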