Update project, add model and converter

2022-07-06 13:07:37 -07:00 · 2022-07-06 13:07:37 -07:00 · fc03c8c866
commit fc03c8c866
parent 9ed9b82bf7
9 changed files with 46955 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+__pycache__
--- a/convert_raw_model.py
+++ b/convert_raw_model.py
@ -0,0 +1,53 @@
+import gptc.tokenizer
+import json
+
+
+def assemble_text(category_1, category_2, raw_model):
+    """Collect the tokens of two categories into newline-separated strings.
+
+    Each entry of ``raw_model`` is indexed by ``"category"`` and ``"text"``.
+    An entry whose category equals ``category_1`` (checked first) or
+    ``category_2`` has its text passed through ``gptc.tokenizer.tokenize``
+    and the resulting tokens are added to that category; every other entry
+    is skipped.
+
+    Returns a ``(category_1_text, category_2_text)`` tuple in which each
+    token is followed by a newline; a category with no tokens yields ``""``.
+    """
+    first_tokens, second_tokens = [], []
+
+    for entry in raw_model:
+        label = entry["category"]
+        if label == category_1:
+            bucket = first_tokens
+        elif label == category_2:
+            bucket = second_tokens
+        else:
+            # Neither requested category: the entry's text is never read.
+            continue
+
+        bucket.extend(gptc.tokenizer.tokenize(entry["text"]))
+
+    # The trailing empty item makes every non-empty result end in a newline.
+    return tuple(
+        "\n".join([*tokens, ""]) for tokens in (first_tokens, second_tokens)
+    )
+
+
+if __name__ == "__main__":
+    # Command-line entry point: load a raw model from a JSON file, gather the
+    # tokens of the two named categories with assemble_text(), and write each
+    # category's tokens to its own output file.
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "cat_1_name", help="the name of category 1 in the model"
+    )
+    parser.add_argument(
+        "cat_2_name", help="the name of category 2 in the model"
+    )
+    parser.add_argument("model_path", help="path to raw model in JSON format")
+    parser.add_argument(
+        "cat_1_file", help="path to file to write category 1 words to"
+    )
+    parser.add_argument(
+        "cat_2_file", help="path to file to write category 2 words to"
+    )
+    args = parser.parse_args()
+
+    # JSON text is UTF-8; do not depend on the platform's locale encoding.
+    with open(args.model_path, encoding="utf-8") as f:
+        raw_model = json.load(f)
+
+    cat1, cat2 = assemble_text(args.cat_1_name, args.cat_2_name, raw_model)
+
+    # Nothing is read back, so plain write mode is enough. An explicit UTF-8
+    # encoding keeps the output the same on every platform and lets it hold
+    # any token the model text produces.
+    with open(args.cat_1_file, "w", encoding="utf-8") as f:
+        f.write(cat1)
+
+    with open(args.cat_2_file, "w", encoding="utf-8") as f:
+        f.write(cat2)
--- a/convert_text.py
+++ b/convert_text.py
@ -0,0 +1,11 @@
+import gptc.tokenizer
+
+# Tokenize each line read with input() and, once input is exhausted, print
+# all collected tokens separated by newlines.
+words = []
+
+try:
+    while True:
+        words.extend(gptc.tokenizer.tokenize(input()))
+except EOFError:
+    # input() signals end of input with EOFError; stop collecting.
+    pass
+
+print("\n".join(words))
--- a/gptc_scratch.sb3
+++ b/gptc_scratch.sb3
--- a/model/compiled/All
+++ b/model/compiled/All
--- a/model/compiled/Word
+++ b/model/compiled/Word
--- a/model/compiled/Word
+++ b/model/compiled/Word
--- a/model/raw/category_1_words_twain.txt
+++ b/model/raw/category_1_words_twain.txt
--- a/model/raw/category_2_words_shakespeare.txt
+++ b/model/raw/category_2_words_shakespeare.txt