Compare commits
No commits in common. "48639f5d8d6562de3258adda6a2ab8c4d39dfaa3" and "62c3c27ddd59dd348b13c9efa005f912a6ba3275" have entirely different histories.
48639f5d8d ... 62c3c27ddd
README.md (12 changed lines)
@@ -4,12 +4,6 @@ General-purpose text classifier in Python
 
 GPTC provides both a CLI tool and a Python library.
 
-## Installation
-
-    pip install gptc[emoji] # handles emojis! (see section "Emoji")
-    # Or, if you don't need emoji support,
-    pip install gptc # no dependencies!
-
 ## CLI Tool
 
 ### Classifying text
@@ -78,12 +72,6 @@ reduced to the one used when compiling the model.
 Models compiled with older versions of GPTC which did not support ngrams are
 handled the same way as models compiled with `max_ngram_length=1`.
 
-## Emoji
-
-If the [`emoji`](https://pypi.org/project/emoji/) package is installed, GPTC
-will automatically handle emojis the same way as words. If it is not installed,
-GPTC will still work but will ignore emojis.
-
 ## Model format
 
 This section explains the raw model format, which is how you should create and
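The README line kept above says GPTC is both a CLI tool and a Python library. For orientation, the library-side calls that appear elsewhere in this comparison (`gptc.compile` and `gptc.Classifier` in the benchmark hunk, `portion["text"]` and `portion["category"]` in the compiler hunks) combine roughly as in this sketch; the raw-model data shown is illustrative, not taken from the repository:

```python
# Sketch assembled from calls visible elsewhere in this comparison.
# The raw-model shape is inferred from the compiler's use of
# portion["text"] and portion["category"]; the example data is made up.
import gptc

raw_model = [
    {"text": "I love this, it works great", "category": "good"},
    {"text": "this is terrible and broken", "category": "bad"},
]

max_ngram_length = 1
model = gptc.compile(raw_model, max_ngram_length)
classifier = gptc.Classifier(model, max_ngram_length)
```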
@@ -33,9 +33,7 @@ print(
 )
 
 
-classifier = gptc.Classifier(
-    gptc.compile(raw_model, max_ngram_length), max_ngram_length
-)
+classifier = gptc.Classifier(gptc.compile(raw_model, max_ngram_length), max_ngram_length)
 print(
     "Average classification time over",
     classify_iterations,
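This hunk only collapses a wrapped call onto one line. For readers piecing the benchmark together, the timing loop implied by `classify_iterations` presumably looks something like the sketch below; only the classifier line and the `print()` arguments are confirmed by the hunk, and the `classify` method name in particular is an assumption:

```python
# Hypothetical reconstruction of the benchmark's timing loop; only the
# classifier construction and the print() call appear in the hunk above.
import time
import gptc

raw_model = [
    {"text": "good stuff", "category": "good"},
    {"text": "bad stuff", "category": "bad"},
]
max_ngram_length = 1
classify_iterations = 1000  # name from the hunk; value illustrative

classifier = gptc.Classifier(gptc.compile(raw_model, max_ngram_length), max_ngram_length)

start = time.perf_counter()
for _ in range(classify_iterations):
    classifier.classify("some text to classify")  # method name assumed
elapsed = time.perf_counter() - start

print(
    "Average classification time over",
    classify_iterations,
    "iterations:",
    elapsed / classify_iterations,
)
```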
@@ -6,34 +6,19 @@ import json
 import sys
 import gptc
 
-
 def main():
     parser = argparse.ArgumentParser(
         description="General Purpose Text Classifier", prog="gptc"
     )
     subparsers = parser.add_subparsers(dest="subparser_name", required=True)
 
-    compile_parser = subparsers.add_parser(
-        "compile", help="compile a raw model"
-    )
+    compile_parser = subparsers.add_parser("compile", help="compile a raw model")
     compile_parser.add_argument("model", help="raw model to compile")
-    compile_parser.add_argument(
-        "--max-ngram-length",
-        "-n",
-        help="maximum ngram length",
-        type=int,
-        default=1,
-    )
+    compile_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
 
     classify_parser = subparsers.add_parser("classify", help="classify text")
     classify_parser.add_argument("model", help="compiled model to use")
-    classify_parser.add_argument(
-        "--max-ngram-length",
-        "-n",
-        help="maximum ngram length",
-        type=int,
-        default=1,
-    )
+    classify_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1)
     group = classify_parser.add_mutually_exclusive_group()
     group.add_argument(
         "-j",
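Given the argparse wiring above (identical in behavior on both sides of this comparison), invocations take roughly this shape. The file names are illustrative, and how `classify` receives its input text (for example via stdin) is outside the visible hunk:

```
gptc compile raw_model.json --max-ngram-length 2
gptc classify -n 2 compiled_model.json
```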
@@ -1,12 +1,9 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 
 import gptc.tokenizer
-from typing import Iterable, Mapping, List, Dict, Union
 
 
-def compile(
-    raw_model: Iterable[Mapping[str, str]], max_ngram_length: int = 1
-) -> Dict[str, Union[int, List[Union[str, int]]]]:
+def compile(raw_model, max_ngram_length=1):
     """Compile a raw model.
 
     Parameters
@@ -24,7 +21,7 @@ def compile(
     """
 
-    categories: Dict[str, str] = {}
+    categories = {}
 
     for portion in raw_model:
         text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
 
@@ -34,7 +31,7 @@ def compile(
         except KeyError:
             categories[category] = text
 
-    categories_by_count: Dict[str, Dict[str, float]] = {}
+    categories_by_count = {}
 
     names = []
 
@@ -52,7 +49,7 @@ def compile(
                 categories_by_count[category][word] = 1 / len(
                     categories[category]
                 )
-    word_weights: Dict[str, Dict[str, float]] = {}
+    word_weights = {}
     for category, words in categories_by_count.items():
         for word, value in words.items():
             try:
@@ -60,7 +57,7 @@ def compile(
             except KeyError:
                 word_weights[word] = {category: value}
 
-    model: Dict[str, Union[int, List[Union[str, int]]]] = {}
+    model = {}
     for word, weights in word_weights.items():
         total = sum(weights.values())
         model[word] = []
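Read together, these hunks outline the weighting scheme: a word's count within a category accumulates in steps of `1 / len(categories[category])`, and `word_weights` then regroups the per-category values by word before `model[word]` is encoded (the encoding continues past the last hunk). A standalone sketch follows; the names and the `except` branches come from the hunks, while the `try`-branch increment and the sample data are assumptions:

```python
# Standalone sketch of the visible weighting logic. The try-branch
# increment and the sample data are assumed; names follow the diff.
categories = {
    "good": ["great", "great", "fun"],
    "bad": ["awful", "broken"],
}

categories_by_count = {}
for category in categories:
    categories_by_count[category] = {}
    for word in categories[category]:
        try:
            categories_by_count[category][word] += 1 / len(
                categories[category]
            )
        except KeyError:
            categories_by_count[category][word] = 1 / len(
                categories[category]
            )

word_weights = {}
for category, words in categories_by_count.items():
    for word, value in words.items():
        try:
            word_weights[word][category] = value
        except KeyError:
            word_weights[word] = {category: value}

print(word_weights["great"])  # {'good': 0.666...}: 2 of 3 "good" tokens
```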
@@ -1,35 +1,13 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 
-try:
-    import emoji
-
-    has_emoji = True
-except ImportError:
-    has_emoji = False
-
 
 def tokenize(text, max_ngram_length=1):
     """Convert a string to a list of lemmas."""
-    text = text.lower()
-
-    if has_emoji:
-        parts = []
-        highest_end = 0
-        for emoji_part in emoji.emoji_list(text):
-            parts += list(text[highest_end : emoji_part["match_start"]])
-            parts.append(emoji_part["emoji"])
-            highest_end = emoji_part["match_end"]
-        parts += list(text[highest_end:])
-        text = [part for part in parts if part]
-
     tokens = [""]
 
-    for char in text:
+    for char in text.lower():
         if char.isalpha() or char == "'":
             tokens[-1] += char
-        elif has_emoji and emoji.is_emoji(char):
-            tokens.append(char)
-            tokens.append("")
         elif tokens[-1] != "":
             tokens.append("")
 
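Tracing the surviving character loop by hand shows the behavioral difference: with the emoji path removed, an emoji no longer becomes a token of its own, it merely terminates the current one. A runnable sketch of the right-hand loop follows (the ngram assembly implied by `max_ngram_length` and the final cleanup happen outside this hunk, so both are assumptions here):

```python
def tokenize_new(text, max_ngram_length=1):
    # Character loop from the right-hand version of tokenizer.py;
    # max_ngram_length is unused in this excerpt of the function.
    tokens = [""]
    for char in text.lower():
        if char.isalpha() or char == "'":
            tokens[-1] += char
        elif tokens[-1] != "":
            tokens.append("")
    return [token for token in tokens if token]  # final cleanup assumed

print(tokenize_new("GPTC's tokenizer, demo 😀"))
# -> ["gptc's", 'tokenizer', 'demo']; the emoji is dropped, whereas the
# old emoji-aware version would have kept it as its own token.
```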