"""Extract the tokenized words of two categories from a raw model JSON file."""

import json

import gptc.tokenizer


def assemble_text(category_1, category_2, raw_model):
    """Collect the tokens of two categories into newline-separated strings.

    Args:
        category_1: Value compared against each entry's ``"category"`` field
            to select entries for the first output.
        category_2: Same, for the second output. Checked only when the entry
            did not match ``category_1``.
        raw_model: Iterable of mappings, each providing ``"category"`` and
            ``"text"`` keys. Entries matching neither category are skipped.

    Returns:
        A ``(category_1_text, category_2_text)`` tuple. Each string holds the
        tokens produced by ``gptc.tokenizer.tokenize`` for the matching
        entries, one token per line, with a trailing newline when at least
        one token was collected (an empty string otherwise).
    """
    category_1_words = []
    category_2_words = []

    for entry in raw_model:
        category = entry["category"]
        if category == category_1:
            target = category_1_words
        elif category == category_2:
            target = category_2_words
        else:
            # Entry belongs to a category we were not asked about.
            continue
        # NOTE(review): tokens are assumed to be strings, since they are
        # joined with "\n" below - confirm against gptc.tokenizer.
        target.extend(gptc.tokenizer.tokenize(entry["text"]))

    # The extra empty element makes the joined text end with a newline.
    return (
        "\n".join(category_1_words + [""]),
        "\n".join(category_2_words + [""]),
    )


def main():
    """Parse the command line, load the raw model and write both word files."""
    import argparse  # only needed when run as a script

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "cat_1_name", help="the name of category 1 in the model"
    )
    parser.add_argument(
        "cat_2_name", help="the name of category 2 in the model"
    )
    parser.add_argument("model_path", help="path to raw model in JSON format")
    parser.add_argument(
        "cat_1_file", help="path to file to write category 1 words to"
    )
    parser.add_argument(
        "cat_2_file", help="path to file to write category 2 words to"
    )
    args = parser.parse_args()

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(args.model_path, encoding="utf-8") as f:
        raw_model = json.load(f)

    cat1, cat2 = assemble_text(args.cat_1_name, args.cat_2_name, raw_model)

    # Write-only access is enough ("w" rather than "w+"), and an explicit
    # encoding keeps non-ASCII tokens from failing on non-UTF-8 locales.
    with open(args.cat_1_file, "w", encoding="utf-8") as f:
        f.write(cat1)
    with open(args.cat_2_file, "w", encoding="utf-8") as f:
        f.write(cat2)


if __name__ == "__main__":
    main()