Update project, add model and converter
This commit is contained in:
parent
9ed9b82bf7
commit
fc03c8c866
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
__pycache__
|
53
convert_raw_model.py
Normal file
53
convert_raw_model.py
Normal file
|
@ -0,0 +1,53 @@
|
|||
import gptc.tokenizer
|
||||
import json
|
||||
|
||||
|
||||
def assemble_text(category_1, category_2, raw_model):
    """Collect the tokens of two categories into newline-separated strings.

    Args:
        category_1: Value compared against each entry's ``"category"`` key
            to select entries for the first result.
        category_2: Value compared against each entry's ``"category"`` key
            to select entries for the second result. Only consulted when
            the entry did not already match ``category_1``.
        raw_model: Iterable of mappings. Every entry must have a
            ``"category"`` key; entries that match one of the two
            categories must also have a ``"text"`` key.

    Returns:
        A ``(category_1_text, category_2_text)`` tuple. Each element holds
        the tokens of the matching entries, one per line, in input order.
        A non-empty result ends with a newline; a category with no tokens
        yields an empty string.
    """
    first_tokens = []
    second_tokens = []

    for entry in raw_model:
        label = entry["category"]
        if label == category_1:
            bucket = first_tokens
        elif label == category_2:
            bucket = second_tokens
        else:
            # Entries belonging to any other category are ignored entirely;
            # their "text" key is never read.
            continue

        # presumably tokenize() returns a list of token strings -- the
        # tokenizer lives in the gptc package, outside this file.
        bucket.extend(gptc.tokenizer.tokenize(entry["text"]))

    # A trailing empty item makes the joined text end with a newline
    # whenever at least one token was collected.
    first_tokens.append("")
    second_tokens.append("")

    return "\n".join(first_tokens), "\n".join(second_tokens)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: read a raw JSON model, split the tokens of
    # two named categories apart, and write each category to its own file.
    import argparse

    # Positional arguments in command-line order, paired with their help text.
    positional_arguments = (
        ("cat_1_name", "the name of category 1 in the model"),
        ("cat_2_name", "the name of category 2 in the model"),
        ("model_path", "path to raw model in JSON format"),
        ("cat_1_file", "path to file to write category 1 words to"),
        ("cat_2_file", "path to file to write category 2 words to"),
    )

    cli = argparse.ArgumentParser()
    for argument_name, argument_help in positional_arguments:
        cli.add_argument(argument_name, help=argument_help)
    options = cli.parse_args()

    with open(options.model_path) as model_file:
        model_entries = json.load(model_file)

    first_text, second_text = assemble_text(
        options.cat_1_name, options.cat_2_name, model_entries
    )

    # Category 1 is written before category 2; each output file is
    # created or truncated by the "w+" mode.
    for destination, text in (
        (options.cat_1_file, first_text),
        (options.cat_2_file, second_text),
    ):
        with open(destination, "w+") as output_file:
            output_file.write(text)
|
11
convert_text.py
Normal file
11
convert_text.py
Normal file
|
@ -0,0 +1,11 @@
|
|||
import gptc.tokenizer
|
||||
|
||||
# Tokenize everything read from standard input, line by line, and print the
# resulting tokens one per line.
words = []

try:
    # input() raises EOFError once standard input is exhausted, which is
    # the only way out of this loop.
    while True:
        words.extend(gptc.tokenizer.tokenize(input()))
except EOFError:
    pass

print("\n".join(words))
|
BIN
gptc_scratch.sb3
BIN
gptc_scratch.sb3
Binary file not shown.
5235
model/compiled/All words.txt
Normal file
5235
model/compiled/All words.txt
Normal file
File diff suppressed because it is too large
Load Diff
5235
model/compiled/Word scores in Category 1.txt
Normal file
5235
model/compiled/Word scores in Category 1.txt
Normal file
File diff suppressed because it is too large
Load Diff
5235
model/compiled/Word scores in Category 2.txt
Normal file
5235
model/compiled/Word scores in Category 2.txt
Normal file
File diff suppressed because it is too large
Load Diff
15624
model/raw/category_1_words_twain.txt
Normal file
15624
model/raw/category_1_words_twain.txt
Normal file
File diff suppressed because it is too large
Load Diff
15561
model/raw/category_2_words_shakespeare.txt
Normal file
15561
model/raw/category_2_words_shakespeare.txt
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user