diff --git a/README.md b/README.md index cd701d4..391c648 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,13 @@ stdout (or "None" if it cannot determine anything). This will print the compiled model in JSON to stdout. +### Packing models + + gptc pack + +This will print the raw model in JSON to stdout. See `models/unpacked/` for an +example of the format. Any exceptions will be printed to stderr. + ## Library ### `gptc.Classifier(model, max_ngram_length=1)` @@ -52,12 +59,44 @@ category:probability, ...}` Classify `text`. Returns the category into which the text is placed (as a string), or `None` when it cannot classify the text. +#### `Classifier.model` + +The classifier's model. + +#### `Classifier.has_emoji` + +Check whether emojis are supported by the `Classifier`. (See section "Emoji.") +Equivalent to `gptc.has_emoji and gptc.model_has_emoji(model)`. + ### `gptc.compile(raw_model, max_ngram_length=1)` + Compile a raw model (as a list, not JSON) and return the compiled model (as a dict). For information about `max_ngram_length`, see section "Ngrams." +### `gptc.pack(directory, print_exceptions=False)` + +Pack the model in `directory` and return a tuple of the format: + + (raw_model, [(exception,),(exception,)...]) + +Note that the exceptions are contained in single-item tuples. This is to allow +more information to be provided without breaking the API in future versions of +GPTC. + +See `models/unpacked/` for an example of the format. + +### `gptc.has_emoji` + +`True` if the `emoji` package is installed (see section "Emoji"), `False` +otherwise. + +### `gptc.model_has_emoji(compiled_model)` + +Returns `True` if `compiled_model` was compiled with emoji support, `False` +otherwise. + ## Ngrams GPTC optionally supports using ngrams to improve classification accuracy. They @@ -84,6 +123,10 @@ If the [`emoji`](https://pypi.org/project/emoji/) package is installed, GPTC will automatically handle emojis the same way as words. 
def model_has_emoji(model: "gptc.compiler.MODEL") -> bool:
    """Report whether a compiled model was compiled with emoji support.

    The answer is read from the model's ``"__emoji__"`` metadata entry.

    Args:
        model: A compiled model mapping.

    Returns:
        ``True`` if ``model["__emoji__"]`` equals ``1``; ``False`` if the
        entry is missing or holds any other value.
    """
    # The key used to be misspelled as "__emoji__]" (stray bracket), so the
    # lookup missed the real entry and fell through to the default.
    # A missing entry defaults to 0, i.e. no emoji support.
    return cast(int, model.get("__emoji__", 0)) == 1