Add emoji checks, improve docs

This commit is contained in:
Samuel Sloniker 2022-07-19 19:15:59 -07:00
parent 73b800d60d
commit 185692790f
4 changed files with 55 additions and 1 deletions

View File

@ -33,6 +33,13 @@ stdout (or "None" if it cannot determine anything).
This will print the compiled model in JSON to stdout.
### Packing models
gptc pack <dir>
This will print the raw model in JSON to stdout. See `models/unpacked/` for an
example of the format. Any exceptions will be printed to stderr.
## Library ## Library
### `gptc.Classifier(model, max_ngram_length=1)` ### `gptc.Classifier(model, max_ngram_length=1)`
@ -52,12 +59,44 @@ category:probability, ...}`
Classify `text`. Returns the category into which the text is placed (as a
string), or `None` when it cannot classify the text.
#### `Classifier.model`
The classifier's model.
#### `Classifier.has_emoji`
Check whether emojis are supported by the `Classifier`. (See section "Emoji.")
Equivalent to `gptc.has_emoji and gptc.model_has_emoji(model)`.
### `gptc.compile(raw_model, max_ngram_length=1)` ### `gptc.compile(raw_model, max_ngram_length=1)`
Compile a raw model (as a list, not JSON) and return the compiled model (as a
dict).
For information about `max_ngram_length`, see section "Ngrams."
### `gptc.pack(directory, print_exceptions=False)`
Pack the model in `directory` and return a tuple of the format:
(raw_model, [(exception,),(exception,)...])
Note that the exceptions are contained in single-item tuples. This is to allow
more information to be provided without breaking the API in future versions of
GPTC.
See `models/unpacked/` for an example of the format.
### `gptc.has_emoji`
`True` if the `emoji` package is installed (see section "Emoji"), `False`
otherwise.
### `gptc.model_has_emoji(compiled_model)`
Returns `True` if `compiled_model` was compiled with emoji support, `False`
otherwise.
## Ngrams ## Ngrams
GPTC optionally supports using ngrams to improve classification accuracy. They GPTC optionally supports using ngrams to improve classification accuracy. They
@ -84,6 +123,10 @@ If the [`emoji`](https://pypi.org/project/emoji/) package is installed, GPTC
will automatically handle emojis the same way as words. If it is not installed,
GPTC will still work but will ignore emojis.
`emoji` must be installed on both the system used to compile the model and the
system used to classify text. Emojis are ignored if it is missing on either
system.
## Model format ## Model format
This section explains the raw model format, which is how you should create and This section explains the raw model format, which is how you should create and

View File

@ -5,6 +5,8 @@
from gptc.compiler import compile as compile from gptc.compiler import compile as compile
from gptc.classifier import Classifier as Classifier from gptc.classifier import Classifier as Classifier
from gptc.pack import pack as pack from gptc.pack import pack as pack
from gptc.tokenizer import has_emoji as has_emoji
from gptc.model_info import model_has_emoji as model_has_emoji
from gptc.exceptions import ( from gptc.exceptions import (
GPTCError as GPTCError, GPTCError as GPTCError,
ModelError as ModelError, ModelError as ModelError,

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: LGPL-3.0-or-later # SPDX-License-Identifier: LGPL-3.0-or-later
import gptc.tokenizer, gptc.compiler, gptc.exceptions, gptc.weighting import gptc.tokenizer, gptc.compiler, gptc.exceptions, gptc.weighting, gptc.model_info
import warnings import warnings
from typing import Dict, Union, cast, List from typing import Dict, Union, cast, List
@ -33,6 +33,7 @@ class Classifier:
self.model = model self.model = model
model_ngrams = cast(int, model.get("__ngrams__", 1)) model_ngrams = cast(int, model.get("__ngrams__", 1))
self.max_ngram_length = min(max_ngram_length, model_ngrams) self.max_ngram_length = min(max_ngram_length, model_ngrams)
self.has_emoji = gptc.tokenizer.has_emoji and gptc.model_info.model_has_emoji(model)
def confidence(self, text: str) -> Dict[str, float]: def confidence(self, text: str) -> Dict[str, float]:
"""Classify text with confidence. """Classify text with confidence.

8
gptc/model_info.py Executable file
View File

@ -0,0 +1,8 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
import gptc.compiler
from typing import Dict, Union, cast, List
def model_has_emoji(model: gptc.compiler.MODEL) -> bool:
    """Return whether a compiled model reports emoji support.

    The model's "__emoji__" metadata entry is read; the model counts as
    emoji-enabled only when that entry equals 1. A model without the
    entry is treated as if it held 0, so this returns False for it.

    Args:
        model: A compiled model (a dict-like object supporting ``.get``).

    Returns:
        True if the "__emoji__" entry equals 1, False otherwise.
    """
    # The key used to be spelled "__emoji__]" (stray bracket), which can
    # never match the metadata entry, so the lookup always fell back to 0.
    return cast(int, model.get("__emoji__", 0)) == 1