Compare commits

...

4 Commits

Author SHA1 Message Date
48639f5d8d Non-working type checks 2022-07-17 16:51:19 -07:00
a207e281e7 Format code with black 2022-07-17 16:28:04 -07:00
e272ab42d1 Document emojis 2022-07-17 16:27:16 -07:00
bd0028a108 Add emoji support to tokenizer 2022-07-17 16:14:02 -07:00
5 changed files with 64 additions and 10 deletions

View File

@@ -4,6 +4,12 @@ General-purpose text classifier in Python
GPTC provides both a CLI tool and a Python library. GPTC provides both a CLI tool and a Python library.
## Installation
pip install gptc[emoji] # handles emojis! (see section "Emoji")
# Or, if you don't need emoji support,
pip install gptc # no dependencies!
## CLI Tool ## CLI Tool
### Classifying text ### Classifying text
@@ -72,6 +78,12 @@ reduced to the one used when compiling the model.
Models compiled with older versions of GPTC which did not support ngrams are Models compiled with older versions of GPTC which did not support ngrams are
handled the same way as models compiled with `max_ngram_length=1`. handled the same way as models compiled with `max_ngram_length=1`.
## Emoji
If the [`emoji`](https://pypi.org/project/emoji/) package is installed, GPTC
will automatically handle emojis the same way as words. If it is not installed,
GPTC will still work but will ignore emojis.
## Model format ## Model format
This section explains the raw model format, which is how you should create and This section explains the raw model format, which is how you should create and

View File

@@ -33,7 +33,9 @@ print(
) )
classifier = gptc.Classifier(gptc.compile(raw_model, max_ngram_length), max_ngram_length) classifier = gptc.Classifier(
gptc.compile(raw_model, max_ngram_length), max_ngram_length
)
print( print(
"Average classification time over", "Average classification time over",
classify_iterations, classify_iterations,

View File

@@ -6,19 +6,34 @@ import json
import sys import sys
import gptc import gptc
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="General Purpose Text Classifier", prog="gptc" description="General Purpose Text Classifier", prog="gptc"
) )
subparsers = parser.add_subparsers(dest="subparser_name", required=True) subparsers = parser.add_subparsers(dest="subparser_name", required=True)
compile_parser = subparsers.add_parser("compile", help="compile a raw model") compile_parser = subparsers.add_parser(
"compile", help="compile a raw model"
)
compile_parser.add_argument("model", help="raw model to compile") compile_parser.add_argument("model", help="raw model to compile")
compile_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1) compile_parser.add_argument(
"--max-ngram-length",
"-n",
help="maximum ngram length",
type=int,
default=1,
)
classify_parser = subparsers.add_parser("classify", help="classify text") classify_parser = subparsers.add_parser("classify", help="classify text")
classify_parser.add_argument("model", help="compiled model to use") classify_parser.add_argument("model", help="compiled model to use")
classify_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1) classify_parser.add_argument(
"--max-ngram-length",
"-n",
help="maximum ngram length",
type=int,
default=1,
)
group = classify_parser.add_mutually_exclusive_group() group = classify_parser.add_mutually_exclusive_group()
group.add_argument( group.add_argument(
"-j", "-j",

View File

@@ -1,9 +1,12 @@
# SPDX-License-Identifier: LGPL-3.0-or-later # SPDX-License-Identifier: LGPL-3.0-or-later
import gptc.tokenizer import gptc.tokenizer
from typing import Iterable, Mapping, List, Dict, Union
def compile(raw_model, max_ngram_length=1): def compile(
raw_model: Iterable[Mapping[str, str]], max_ngram_length: int = 1
) -> Dict[str, Union[int, List[Union[str, int]]]]:
"""Compile a raw model. """Compile a raw model.
Parameters Parameters
@@ -21,7 +24,7 @@ def compile(raw_model, max_ngram_length=1):
""" """
categories = {} categories: Dict[str, str] = {}
for portion in raw_model: for portion in raw_model:
text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length) text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
@@ -31,7 +34,7 @@ def compile(raw_model, max_ngram_length=1):
except KeyError: except KeyError:
categories[category] = text categories[category] = text
categories_by_count = {} categories_by_count: Dict[str, Dict[str, float]] = {}
names = [] names = []
@@ -49,7 +52,7 @@ def compile(raw_model, max_ngram_length=1):
categories_by_count[category][word] = 1 / len( categories_by_count[category][word] = 1 / len(
categories[category] categories[category]
) )
word_weights = {} word_weights: Dict[str, Dict[str, float]] = {}
for category, words in categories_by_count.items(): for category, words in categories_by_count.items():
for word, value in words.items(): for word, value in words.items():
try: try:
@@ -57,7 +60,7 @@ def compile(raw_model, max_ngram_length=1):
except KeyError: except KeyError:
word_weights[word] = {category: value} word_weights[word] = {category: value}
model = {} model: Dict[str, Union[int, List[Union[str, int]]]] = {}
for word, weights in word_weights.items(): for word, weights in word_weights.items():
total = sum(weights.values()) total = sum(weights.values())
model[word] = [] model[word] = []

View File

@@ -1,13 +1,35 @@
# SPDX-License-Identifier: LGPL-3.0-or-later # SPDX-License-Identifier: LGPL-3.0-or-later
try:
import emoji
has_emoji = True
except ImportError:
has_emoji = False
def tokenize(text, max_ngram_length=1): def tokenize(text, max_ngram_length=1):
"""Convert a string to a list of lemmas.""" """Convert a string to a list of lemmas."""
text = text.lower()
if has_emoji:
parts = []
highest_end = 0
for emoji_part in emoji.emoji_list(text):
parts += list(text[highest_end : emoji_part["match_start"]])
parts.append(emoji_part["emoji"])
highest_end = emoji_part["match_end"]
parts += list(text[highest_end:])
text = [part for part in parts if part]
tokens = [""] tokens = [""]
for char in text.lower(): for char in text:
if char.isalpha() or char == "'": if char.isalpha() or char == "'":
tokens[-1] += char tokens[-1] += char
elif has_emoji and emoji.is_emoji(char):
tokens.append(char)
tokens.append("")
elif tokens[-1] != "": elif tokens[-1] != "":
tokens.append("") tokens.append("")