Compare commits

...

4 Commits

Author SHA1 Message Date
48639f5d8d Non-working type checks 2022-07-17 16:51:19 -07:00
a207e281e7 Format code with black 2022-07-17 16:28:04 -07:00
e272ab42d1 Document emojis 2022-07-17 16:27:16 -07:00
bd0028a108 Add emoji support to tokenizer 2022-07-17 16:14:02 -07:00
5 changed files with 64 additions and 10 deletions

View File

@@ -4,6 +4,12 @@ General-purpose text classifier in Python
GPTC provides both a CLI tool and a Python library. GPTC provides both a CLI tool and a Python library.
## Installation
pip install gptc[emoji] # handles emojis! (see section "Emoji")
# Or, if you don't need emoji support,
pip install gptc # no dependencies!
## CLI Tool ## CLI Tool
### Classifying text ### Classifying text
@@ -72,6 +78,12 @@ reduced to the one used when compiling the model.
Models compiled with older versions of GPTC which did not support ngrams are Models compiled with older versions of GPTC which did not support ngrams are
handled the same way as models compiled with `max_ngram_length=1`. handled the same way as models compiled with `max_ngram_length=1`.
## Emoji
If the [`emoji`](https://pypi.org/project/emoji/) package is installed, GPTC
will automatically handle emojis the same way as words. If it is not installed,
GPTC will still work but will ignore emojis.
## Model format ## Model format
This section explains the raw model format, which is how you should create and This section explains the raw model format, which is how you should create and

View File

@@ -33,7 +33,9 @@ print(
) )
classifier = gptc.Classifier(gptc.compile(raw_model, max_ngram_length), max_ngram_length) classifier = gptc.Classifier(
gptc.compile(raw_model, max_ngram_length), max_ngram_length
)
print( print(
"Average classification time over", "Average classification time over",
classify_iterations, classify_iterations,

View File

@@ -6,19 +6,34 @@ import json
import sys import sys
import gptc import gptc
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="General Purpose Text Classifier", prog="gptc" description="General Purpose Text Classifier", prog="gptc"
) )
subparsers = parser.add_subparsers(dest="subparser_name", required=True) subparsers = parser.add_subparsers(dest="subparser_name", required=True)
compile_parser = subparsers.add_parser("compile", help="compile a raw model") compile_parser = subparsers.add_parser(
"compile", help="compile a raw model"
)
compile_parser.add_argument("model", help="raw model to compile") compile_parser.add_argument("model", help="raw model to compile")
compile_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1) compile_parser.add_argument(
"--max-ngram-length",
"-n",
help="maximum ngram length",
type=int,
default=1,
)
classify_parser = subparsers.add_parser("classify", help="classify text") classify_parser = subparsers.add_parser("classify", help="classify text")
classify_parser.add_argument("model", help="compiled model to use") classify_parser.add_argument("model", help="compiled model to use")
classify_parser.add_argument("--max-ngram-length", "-n", help="maximum ngram length", type=int, default=1) classify_parser.add_argument(
"--max-ngram-length",
"-n",
help="maximum ngram length",
type=int,
default=1,
)
group = classify_parser.add_mutually_exclusive_group() group = classify_parser.add_mutually_exclusive_group()
group.add_argument( group.add_argument(
"-j", "-j",

View File

@@ -1,9 +1,12 @@
# SPDX-License-Identifier: LGPL-3.0-or-later # SPDX-License-Identifier: LGPL-3.0-or-later
import gptc.tokenizer import gptc.tokenizer
from typing import Iterable, Mapping, List, Dict, Union
def compile(raw_model, max_ngram_length=1): def compile(
raw_model: Iterable[Mapping[str, str]], max_ngram_length: int = 1
) -> Dict[str, Union[int, List[Union[str, int]]]]:
"""Compile a raw model. """Compile a raw model.
Parameters Parameters
@@ -21,7 +24,7 @@ def compile(raw_model, max_ngram_length=1):
""" """
categories = {} categories: Dict[str, str] = {}
for portion in raw_model: for portion in raw_model:
text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length) text = gptc.tokenizer.tokenize(portion["text"], max_ngram_length)
@@ -31,7 +34,7 @@ def compile(raw_model, max_ngram_length=1):
except KeyError: except KeyError:
categories[category] = text categories[category] = text
categories_by_count = {} categories_by_count: Dict[str, Dict[str, float]] = {}
names = [] names = []
@@ -49,7 +52,7 @@ def compile(raw_model, max_ngram_length=1):
categories_by_count[category][word] = 1 / len( categories_by_count[category][word] = 1 / len(
categories[category] categories[category]
) )
word_weights = {} word_weights: Dict[str, Dict[str, float]] = {}
for category, words in categories_by_count.items(): for category, words in categories_by_count.items():
for word, value in words.items(): for word, value in words.items():
try: try:
@@ -57,7 +60,7 @@ def compile(raw_model, max_ngram_length=1):
except KeyError: except KeyError:
word_weights[word] = {category: value} word_weights[word] = {category: value}
model = {} model: Dict[str, Union[int, List[Union[str, int]]]] = {}
for word, weights in word_weights.items(): for word, weights in word_weights.items():
total = sum(weights.values()) total = sum(weights.values())
model[word] = [] model[word] = []

View File

@@ -1,13 +1,35 @@
# SPDX-License-Identifier: LGPL-3.0-or-later # SPDX-License-Identifier: LGPL-3.0-or-later
try:
import emoji
has_emoji = True
except ImportError:
has_emoji = False
def tokenize(text, max_ngram_length=1): def tokenize(text, max_ngram_length=1):
"""Convert a string to a list of lemmas.""" """Convert a string to a list of lemmas."""
text = text.lower()
if has_emoji:
parts = []
highest_end = 0
for emoji_part in emoji.emoji_list(text):
parts += list(text[highest_end : emoji_part["match_start"]])
parts.append(emoji_part["emoji"])
highest_end = emoji_part["match_end"]
parts += list(text[highest_end:])
text = [part for part in parts if part]
tokens = [""] tokens = [""]
for char in text.lower(): for char in text:
if char.isalpha() or char == "'": if char.isalpha() or char == "'":
tokens[-1] += char tokens[-1] += char
elif has_emoji and emoji.is_emoji(char):
tokens.append(char)
tokens.append("")
elif tokens[-1] != "": elif tokens[-1] != "":
tokens.append("") tokens.append("")