A GPTC model to classify American news as right- or left-leaning
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

42 lines
1.3 KiB

#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
import sqlite3
import tomli
import gptc
# Compile the articles stored in articles.db into a GPTC model and write the
# serialized result to model.gptc.
#
# Steps:
#   1. Read the compiler settings from compiler.toml.
#   2. Load every (text, category) row of the `articles` table.
#   3. Compile those rows with gptc.compile().
#   4. Serialize the compiled model to model.gptc.
#
# The output file is opened only after compilation has succeeded, so a failure
# in an earlier step does not truncate an existing model.gptc.

# tomli.load() expects a file object opened in binary mode.
with open("compiler.toml", "rb") as f:
    config = tomli.load(f)

# Look both settings up now so that a missing key raises KeyError before the
# database or the output file is touched.
max_ngram_length = config["max_ngram_length"]
min_count = config["min_count"]

con = sqlite3.connect("articles.db")
try:
    # Make sure the table exists so the SELECT below also works on a fresh
    # database (it then simply yields no rows).
    con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")
    raw_model = [
        {"text": text, "category": category}
        for text, category in con.execute("SELECT text, category FROM articles;")
    ]
    # Kept from the original script: flush any transaction still open on the
    # connection before it is closed.
    con.commit()
finally:
    # Always release the connection, including when the query above fails.
    con.close()

# Compile first; the database is no longer needed at this point.
model = gptc.compile(
    raw_model,
    max_ngram_length=max_ngram_length,
    min_count=min_count,
)

# "w+b" is the mode used by the original script; it truncates the file on open.
with open("model.gptc", "w+b") as f:
    model.serialize(f)