You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
53 lines
1.4 KiB
53 lines
1.4 KiB
import gptc.tokenizer |
|
import json |
|
|
|
|
|
def assemble_text(category_1, category_2, raw_model):
    """Gather the tokens of two categories from a raw model as text.

    Each entry of ``raw_model`` is read through its ``"category"`` key.
    Entries whose category equals ``category_1`` or ``category_2`` have
    their ``"text"`` value passed to ``gptc.tokenizer.tokenize`` and the
    resulting tokens collected for that category; all other entries are
    skipped without looking at their ``"text"``.

    If both category names are equal, every matching entry is attributed
    to the first one, because it is checked first.

    Args:
        category_1: Category value whose tokens form the first result.
        category_2: Category value whose tokens form the second result.
        raw_model: Iterable of mappings with ``"category"`` and ``"text"``
            keys.

    Returns:
        A ``(category_1_text, category_2_text)`` tuple of strings. Each
        string holds one token per line, in model order, and ends with a
        newline. A category with no tokens yields the empty string.
    """
    category_1_words = []
    category_2_words = []

    for text_dict in raw_model:
        category = text_dict["category"]
        if category == category_1:
            words = category_1_words
        elif category == category_2:
            words = category_2_words
        else:
            # Entries belonging to neither requested category are ignored.
            continue

        # `words` aliases one of the two lists above, so this extends it
        # in place.
        words.extend(gptc.tokenizer.tokenize(text_dict["text"]))

    # A final empty item makes "\n".join() end the text with a newline
    # (and leaves the result empty when there were no tokens at all).
    category_1_words.append("")
    category_2_words.append("")

    return "\n".join(category_1_words), "\n".join(category_2_words)
|
|
|
|
|
def main():
    """Export the tokens of two model categories to two text files.

    Positional command-line arguments, in order:

    * ``cat_1_name`` / ``cat_2_name`` -- names of the two categories to
      extract from the model.
    * ``model_path`` -- path of the raw model, a JSON file.
    * ``cat_1_file`` / ``cat_2_file`` -- output paths; each file is
      created or truncated and receives the text produced by
      ``assemble_text`` for the corresponding category.
    """
    # Imported here because it is only needed when run as a script.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "cat_1_name", help="the name of category 1 in the model"
    )
    parser.add_argument(
        "cat_2_name", help="the name of category 2 in the model"
    )
    parser.add_argument("model_path", help="path to raw model in JSON format")
    parser.add_argument(
        "cat_1_file", help="path to file to write category 1 words to"
    )
    parser.add_argument(
        "cat_2_file", help="path to file to write category 2 words to"
    )
    args = parser.parse_args()

    # JSON text is UTF-8 by specification; do not depend on the locale's
    # default encoding when reading it.
    with open(args.model_path, encoding="utf-8") as f:
        raw_model = json.load(f)

    cat1, cat2 = assemble_text(args.cat_1_name, args.cat_2_name, raw_model)

    # Write-only access is sufficient ("w" rather than "w+"); UTF-8 keeps
    # non-ASCII tokens from failing under a narrower locale encoding.
    with open(args.cat_1_file, "w", encoding="utf-8") as f:
        f.write(cat1)

    with open(args.cat_2_file, "w", encoding="utf-8") as f:
        f.write(cat2)


if __name__ == "__main__":
    main()
|
|
|