diff --git a/gptc/__init__.py b/gptc/__init__.py
index b970c05..fd24dce 100644
--- a/gptc/__init__.py
+++ b/gptc/__init__.py
@@ -6,6 +6,7 @@ from gptc.compiler import compile as compile
 from gptc.classifier import Classifier as Classifier
 from gptc.pack import pack as pack
 from gptc.model import Model as Model, deserialize as deserialize
+from gptc.tokenizer import normalize as normalize
 from gptc.exceptions import (
     GPTCError as GPTCError,
     ModelError as ModelError,
diff --git a/gptc/tokenizer.py b/gptc/tokenizer.py
index bd5cd6d..1d6ca10 100644
--- a/gptc/tokenizer.py
+++ b/gptc/tokenizer.py
@@ -46,3 +46,7 @@ def hash(tokens: List[str]) -> List[int]:
         )
         for token in tokens
     ]
+
+
+def normalize(text: str) -> str:
+    return " ".join(tokenize(text, 1))
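
For context, a minimal usage sketch of the newly exported helper. The example input and output are illustrative only; the exact result depends on how tokenize() handles case, punctuation, and markup, which this patch does not change.

    import gptc

    # normalize() runs the text through the library's own tokenizer and
    # rejoins the resulting tokens with single spaces, so callers can
    # pre-clean text the same way the classifier will later see it.
    print(gptc.normalize("Hello, World!"))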