Compare commits
4 Commits
ff8cba84c7
...
185692790f
Author | SHA1 | Date | |
---|---|---|---|
185692790f | |||
73b800d60d | |||
b61ad35ae7 | |||
dc6eb48625 |
49
README.md
49
README.md
|
@ -14,7 +14,7 @@ GPTC provides both a CLI tool and a Python library.
|
|||
|
||||
### Classifying text
|
||||
|
||||
python -m gptc classify [-n <max_ngram_length>] <compiled model file>
|
||||
gptc classify [-n <max_ngram_length>] <compiled model file>
|
||||
|
||||
This will prompt for a string and classify it, then print (in JSON) a dict of
|
||||
the format `{category: probability, category: probability, ...}` to stdout. (For
|
||||
|
@ -22,17 +22,24 @@ information about `-n <max_ngram_length>`, see section "Ngrams.")
|
|||
|
||||
Alternatively, if you only need the most likely category, you can use this:
|
||||
|
||||
python -m gptc classify [-n <max_ngram_length>] <-c|--category> <compiled model file>
|
||||
gptc classify [-n <max_ngram_length>] <-c|--category> <compiled model file>
|
||||
|
||||
This will prompt for a string and classify it, outputting the category on
|
||||
stdout (or "None" if it cannot determine anything).
|
||||
|
||||
### Compiling models
|
||||
|
||||
python -m gptc compile [-n <max_ngram_length>] <raw model file>
|
||||
gptc compile [-n <max_ngram_length>] <raw model file>
|
||||
|
||||
This will print the compiled model in JSON to stdout.
|
||||
|
||||
### Packing models
|
||||
|
||||
gptc pack <dir>
|
||||
|
||||
This will print the raw model in JSON to stdout. See `models/unpacked/` for an
|
||||
example of the format. Any exceptions will be printed to stderr.
|
||||
|
||||
## Library
|
||||
|
||||
### `gptc.Classifier(model, max_ngram_length=1)`
|
||||
|
@ -52,12 +59,44 @@ category:probability, ...}`
|
|||
Classify `text`. Returns the category into which the text is placed (as a
|
||||
string), or `None` when it cannot classify the text.
|
||||
|
||||
#### `Classifier.model`
|
||||
|
||||
The classifier's model.
|
||||
|
||||
#### `Classifier.has_emoji`
|
||||
|
||||
Check whether emojis are supported by the `Classifier`. (See section "Emoji.")
|
||||
Equivalent to `gptc.has_emoji and gptc.model_has_emoji(model)`.
|
||||
|
||||
### `gptc.compile(raw_model, max_ngram_length=1)`
|
||||
|
||||
Compile a raw model (as a list, not JSON) and return the compiled model (as a
|
||||
dict).
|
||||
|
||||
For information about `max_ngram_length`, see section "Ngrams."
|
||||
|
||||
### `gptc.pack(directory, print_exceptions=False)`
|
||||
|
||||
Pack the model in `directory` and return a tuple of the format:
|
||||
|
||||
(raw_model, [(exception,),(exception,)...])
|
||||
|
||||
Note that the exceptions are contained in single-item tuples. This is to allow
|
||||
more information to be provided without breaking the API in future versions of
|
||||
GPTC.
|
||||
|
||||
See `models/unpacked/` for an example of the format.
|
||||
|
||||
### `gptc.has_emoji`
|
||||
|
||||
`True` if the `emoji` package is installed (see section "Emoji"), `False`
|
||||
otherwise.
|
||||
|
||||
### `gptc.model_has_emoji(compiled_model)`
|
||||
|
||||
Returns `True` if `compiled_model` was compiled with emoji support, `False`
|
||||
otherwise.
|
||||
|
||||
## Ngrams
|
||||
|
||||
GPTC optionally supports using ngrams to improve classification accuracy. They
|
||||
|
@ -84,6 +123,10 @@ If the [`emoji`](https://pypi.org/project/emoji/) package is installed, GPTC
|
|||
will automatically handle emojis the same way as words. If it is not installed,
|
||||
GPTC will still work but will ignore emojis.
|
||||
|
||||
`emoji` must be installed on both the system used to compile the model and the
|
||||
system used to classify text. Emojis are ignored if it is missing on either
|
||||
system.
|
||||
|
||||
## Model format
|
||||
|
||||
This section explains the raw model format, which is how you should create and
|
||||
|
|
|
@ -5,6 +5,8 @@
|
|||
from gptc.compiler import compile as compile
|
||||
from gptc.classifier import Classifier as Classifier
|
||||
from gptc.pack import pack as pack
|
||||
from gptc.tokenizer import has_emoji as has_emoji
|
||||
from gptc.model_info import model_has_emoji as model_has_emoji
|
||||
from gptc.exceptions import (
|
||||
GPTCError as GPTCError,
|
||||
ModelError as ModelError,
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# SPDX-License-Identifier: LGPL-3.0-or-later
|
||||
|
||||
import gptc.tokenizer, gptc.compiler, gptc.exceptions, gptc.weighting
|
||||
import gptc.tokenizer, gptc.compiler, gptc.exceptions, gptc.weighting, gptc.model_info
|
||||
import warnings
|
||||
from typing import Dict, Union, cast, List
|
||||
|
||||
|
@ -33,6 +33,7 @@ class Classifier:
|
|||
self.model = model
|
||||
model_ngrams = cast(int, model.get("__ngrams__", 1))
|
||||
self.max_ngram_length = min(max_ngram_length, model_ngrams)
|
||||
self.has_emoji = gptc.tokenizer.has_emoji and gptc.model_info.model_has_emoji(model)
|
||||
|
||||
def confidence(self, text: str) -> Dict[str, float]:
|
||||
"""Classify text with confidence.
|
||||
|
|
8
gptc/model_info.py
Executable file
8
gptc/model_info.py
Executable file
|
@ -0,0 +1,8 @@
|
|||
# SPDX-License-Identifier: LGPL-3.0-or-later
|
||||
|
||||
import gptc.compiler
|
||||
from typing import Dict, Union, cast, List
|
||||
|
||||
|
||||
def model_has_emoji(model: gptc.compiler.MODEL) -> bool:
    """Return whether a compiled model was compiled with emoji support.

    The answer is read from the model's ``"__emoji__"`` entry. A model
    that lacks the entry is treated as having no emoji support.

    Args:
        model: A compiled model mapping.

    Returns:
        ``True`` if the ``"__emoji__"`` entry equals ``1``, ``False``
        otherwise (including when the entry is absent).
    """
    # The key used to be spelled "__emoji__]" (stray bracket), which never
    # matched any entry and made this function return False for every model.
    # NOTE(review): assumes the compiler stores the flag under "__emoji__",
    # mirroring the "__ngrams__" metadata key -- confirm against the compiler.
    return cast(int, model.get("__emoji__", 0)) == 1
|
|
@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|||
|
||||
[project]
|
||||
name = "gptc"
|
||||
version = "2.0.1"
|
||||
version = "2.1.0"
|
||||
description = "General-purpose text classifier"
|
||||
readme = "README.md"
|
||||
authors = [{ name = "Samuel Sloniker", email = "sam@kj7rrv.com"}]
|
||||
|
@ -18,6 +18,9 @@ classifiers = [
|
|||
dependencies = []
|
||||
requires-python = ">=3.7"
|
||||
|
||||
[project.optional-dependencies]
|
||||
emoji = ["emoji"]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://git.kj7rrv.com/kj7rrv/gptc"
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user