11 changed files with 88 additions and 380 deletions
--- a/.gitignore
+++ b/.gitignore
@ -141,4 +141,3 @@ cython_debug/
 *.db
 *.db-journal
 *.gptc
-build/
--- a/README.md
+++ b/README.md
@ -3,19 +3,10 @@
 A [GPTC](https://git.kj7rrv.com/kj7rrv/gptc) model to classify American news as
 right- or left-leaning

-## Scripts
-
-No scripts take any arguments.
-
-* `./download.py`: download new articles and add them to the database
-* `./compile.py`: compile GPTC model
-* `./export.py`: create `build/` directory with files for release
-* `./stats.py`: print statistics on article and source counts
+Inclusion of a site in this model is not an endorsement of the site.

 ## Sources

-Inclusion of a site in this model is not an endorsement of the site.
-
 ### Left

 * ABC News
@ -23,26 +14,19 @@ Inclusion of a site in this model is not an endorsement of the site.
 * CBS News
 * CNBC
 * CNN
-* Democracy Now!
 * HuffPost (formerly Huffington Post)
-* The Intercept
 * Los Angeles Times
 * PBS NewsHour
-* Slate
-* The Washington Post

 ### Right

-* The American Conservative
 * American Thinker
 * Breitbart
 * Daily Caller
 * Epoch Times
 * The Federalist
 * Fox News
-* LifeSiteNews
-* New York Post
 * Not the Bee
+* New York Post
 * One America News Network
-* RedState
 * Washington Examiner
--- a/analyses/constitutional_amendments.py
+++ b/analyses/constitutional_amendments.py
@ -1,70 +0,0 @@
-import gptc
-
-amendments = [
-    ("1st", "First"),
-    ("2nd", "Second"),
-    ("3rd", "Third"),
-    ("4th", "Fourth"),
-    ("5th", "Fifth"),
-    ("6th", "Sixth"),
-    ("7th", "Seventh"),
-    ("8th", "Eighth"),
-    ("9th", "Ninth"),
-    ("10th", "Tenth"),
-    ("11th", "Eleventh"),
-    ("12th", "Twelfth"),
-    ("13th", "Thirteenth"),
-    ("14th", "Fourteenth"),
-    ("15th", "Fifteenth"),
-    ("16th", "Sixteenth"),
-    ("17th", "Seventeenth"),
-    ("18th", "Eighteenth"),
-    ("19th", "Nineteenth"),
-    ("20th", "Twentieth"),
-    ("21st", "Twenty-first"),
-    ("22nd", "Twenty-second"),
-    ("23rd", "Twenty-third"),
-    ("24th", "Twenty-fourth"),
-    ("25th", "Twenty-fifth"),
-    ("26th", "Twenty-sixth"),
-    ("27th", "Twenty-seventh"),
-]
-
-with open("model.gptc", "rb") as f:
-    model = gptc.deserialize(f)
-
-data = {}
-
-for number, name in amendments:
-    number_data = model.get(number + " Amendment")
-    name_data = model.get(name + " Amendment")
-
-    if number_data and not name_data:
-        data[name] = number_data
-    elif name_data and not number_data:
-        data[name] = name_data
-    elif number_data and name_data:
-        data[name] = {
-            key: (number_data[key] + name_data[key]) / 2
-            for key in number_data.keys()
-        }
-
-classified_amendments = sorted(data.items(), key=lambda x: x[1]["left"])
-
-print("# Constitutional Amendment Analysis")
-print()
-print("""This is an analysis of which amendments to the U.S. Constitution are mentioned
-more in right- or left-leaning American news sources. Data do not necessarily
-correlate with support or opposition for the amendment among right- or
-left-leaning Americans.""")
-print()
-print("| Amendment      | Left  | Right |")
-print("+----------------+-------+-------+")
-for amendment, data in classified_amendments:
-    percent_right = f"{data['right']*100:>4.1f}%"
-    percent_left = f"{data['left']*100:>4.1f}%"
-
-    amendment_padding = " "*(14 - len(amendment))
-    print(f"| {amendment}{amendment_padding} | {percent_left} | {percent_right} |")
-print("+----------------+-------+-------+")
-print("| Amendment      | Left  | Right |")
--- a/analyses/states.py
+++ b/analyses/states.py
@ -1,85 +0,0 @@
-import gptc
-
-states = [
-    "Alabama",
-    "Alaska",
-    "Arizona",
-    "Arkansas",
-    "California",
-    "Colorado",
-    "Connecticut",
-    "Delaware",
-    "Florida",
-    "Georgia",
-    "Hawaii",
-    "Idaho",
-    "Illinois",
-    "Indiana",
-    "Iowa",
-    "Kansas",
-    "Kentucky",
-    "Louisiana",
-    "Maine",
-    "Maryland",
-    "Massachusetts",
-    "Michigan",
-    "Minnesota",
-    "Mississippi",
-    "Missouri",
-    "Montana",
-    "Nebraska",
-    "Nevada",
-    "New Hampshire",
-    "New Jersey",
-    "New Mexico",
-    "New York",
-    "North Carolina",
-    "North Dakota",
-    "Ohio",
-    "Oklahoma",
-    "Oregon",
-    "Pennsylvania",
-    "Rhode Island",
-    "South Carolina",
-    "South Dakota",
-    "Tennessee",
-    "Texas",
-    "Utah",
-    "Vermont",
-    "Virginia",
-    "Washington",
-    "West Virginia",
-    "Wisconsin",
-    "Wyoming",
-]
-
-with open("model.gptc", "rb") as f:
-    model = gptc.deserialize(f)
-
-classified_states = []
-
-for state in states:
-    classified_states.append((state, model.get(state),))
-
-classified_states.sort(key=lambda x: x[1]["left"])
-
-longest = max([len(state) for state in states])
-
-print("# State Analysis")
-print()
-print("""This is an analysis of which states are mentioned more in right- or left-
-leaning American news sources. Results do not necessarily correlate with the
-political views of residents of the states; for example, the predominantly
-liberal state of Oregon is mentioned more in right-leaning sources than in
-left-leaning ones.""")
-print()
-print("| State          | Left  | Right |")
-print("+----------------+-------+-------+")
-for state, data in classified_states:
-    percent_right = f"{round(data['right']*1000)/10}%"
-    percent_left = f"{round(data['left']*1000)/10}%"
-
-    state_padding = " "*(longest - len(state))
-    print(f"| {state}{state_padding} | {percent_left} | {percent_right} |")
-print("+----------------+-------+-------+")
-print("| State          | Left  | Right |")
--- a/compile.py
+++ b/compile.py
@ -1,28 +1,6 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-# Copyright (c) 2022 Samuel L Sloniker
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU General Public License as published by the Free Software
-# Foundation, either version 3 of the License, or (at your option) any later
-# version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-# details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program. If not, see <https://www.gnu.org/licenses/>.
-
 import sqlite3
-import tomli
 import gptc

-with open("compiler.toml", "rb") as f:
-    config = tomli.load(f)
-
 con = sqlite3.connect("articles.db")
 con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")

@ -32,11 +10,9 @@ raw_model = [
 ]

 with open("model.gptc", "w+b") as f:
-    gptc.compile(
-        raw_model,
-        max_ngram_length=config["max_ngram_length"],
-        min_count=config["min_count"],
-    ).serialize(f)
+    f.write(
+        gptc.compile(raw_model, max_ngram_length=3, min_count=3).serialize()
+    )

 con.commit()
 con.close()
--- a/compiler.toml
+++ b/compiler.toml
@ -1,2 +0,0 @@
-max_ngram_length=5
-min_count=5
--- a/download.py
+++ b/download.py
@ -1,21 +1,3 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-# Copyright (c) 2022 Samuel L Sloniker
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU General Public License as published by the Free Software
-# Foundation, either version 3 of the License, or (at your option) any later
-# version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-# details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program. If not, see <https://www.gnu.org/licenses/>. 
-
 import feedparser
 import sqlite3
 import goose3
--- a/export.py
+++ b/export.py
@ -1,39 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-# Copyright (c) 2022 Samuel L Sloniker
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU General Public License as published by the Free Software
-# Foundation, either version 3 of the License, or (at your option) any later
-# version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-# details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program. If not, see <https://www.gnu.org/licenses/>. 
-
-import os
-import shutil
-import sqlite3
-
-try:
-    shutil.rmtree("build")
-except FileNotFoundError:
-    pass
-os.mkdir("build")
-
-shutil.copy("articles.db", "build/articles.db")
-shutil.copy("sources.toml", "build/sources.toml")
-shutil.copy("compiler.toml", "build/compiler.toml")
-shutil.copy("model.gptc", "build/model.gptc")
-
-con = sqlite3.Connection("build/articles.db")
-con.execute("UPDATE articles SET text = '***';")
-con.commit()
-con.execute("VACUUM;")
-con.commit()
-con.close()
--- a/export.sh
+++ b/export.sh
@ -0,0 +1,8 @@
+#!/bin/sh
+set -e  # stop at the first failure so a failed backup can never lead to data loss
+cp articles.db old_articles.db  # keep the full-text database so it can be restored
+sqlite3 articles.db "UPDATE articles SET text = '***';"  # SQL string literals take single quotes
+sqlite3 articles.db 'VACUUM;'  # reclaim the space the article text occupied
+printf 'Press enter when done...'  # printf, because "echo -n" is not portable under /bin/sh
+read -r _  # POSIX read requires a variable name; the input itself is discarded
+mv -f old_articles.db articles.db  # put the original back over the scrubbed copy
--- a/sources.toml
+++ b/sources.toml
@ -5,11 +5,41 @@
 # ? Newsmax (read timeout errors)
 # ? Bloomberg (CAPTCHA on RSS feed?)

-[abc_news]
-feed="https://abcnews.go.com/abcnews/usheadlines"
+[cnn]
+feed="http://rss.cnn.com/rss/cnn_latest.rss"
 category="left"
 contains_articles=false
-name="ABC News"
+name="CNN"
+
+[huffpost]
+feed="https://chaski.huffpost.com/us/auto"
+category="left"
+contains_articles=false
+name="HuffPost"
+
+[cnbc]
+feed="https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
+category="left"
+contains_articles=false
+name="CNBC"
+
+[pbs_newshour]
+feed="https://www.pbs.org/newshour/feeds/rss/headlines"
+category="left"
+contains_articles=false
+name="PBS NewsHour"
+
+[latimes]
+feed="https://www.latimes.com/local/rss2.0.xml"
+category="left"
+contains_articles=false
+name="Los Angeles Times"
+
+[cbs_news]
+feed="https://www.cbsnews.com/latest/rss/main"
+category="left"
+contains_articles=false
+name="CBS News"

 [atlantic]
 feed="https://www.theatlantic.com/feed/all/"
@ -18,35 +48,11 @@ contains_articles=true
 name="The Atlantic"
 sort="Atlantic"

-[cbs_news]
-feed="https://www.cbsnews.com/latest/rss/main"
+[abc_news]
+feed="https://abcnews.go.com/abcnews/usheadlines"
 category="left"
 contains_articles=false
-name="CBS News"
-
-[cnbc]
-feed="https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
-category="left"
-contains_articles=false
-name="CNBC"
-
-[cnn]
-feed="http://rss.cnn.com/rss/cnn_latest.rss"
-category="left"
-contains_articles=false
-name="CNN"
-
-[democracy_now]
-feed="https://www.democracynow.org/democracynow.rss"
-category="left"
-contains_articles=false
-name="Democracy Now!"
-
-[huffpost]
-feed="https://chaski.huffpost.com/us/auto"
-category="left"
-contains_articles=false
-name="HuffPost"
+name="ABC News"

 [intercept]
 feed="https://theintercept.com/feed/?lang=en"
@ -55,42 +61,45 @@ contains_articles=true
 name="The Intercept"
 sort="Intercept"

-[latimes]
-feed="https://www.latimes.com/local/rss2.0.xml"
-category="left"
-contains_articles=false
-name="Los Angeles Times"
-
-[pbs_newshour]
-feed="https://www.pbs.org/newshour/feeds/rss/headlines"
-category="left"
-contains_articles=false
-name="PBS NewsHour"
-
-[slate]
-feed="http://www.slate.com/articles/news_and_politics.fulltext.all.10.rss"
-category="left"
-contains_articles=false
-name="Slate"
-
-[washington_post]
-feed="https://feeds.washingtonpost.com/rss/national"
-category="left"
-contains_articles=false
-name="The Washington Post"
-sort="Washington Post"
-
 #[bloomberg]
 #feed="https://www.bloomberg.com/politics/feeds/site.xml"
 #category="left"
 #contains_articles=false

-[american_conservative]
-feed="https://theamericanconservative.com/articles/feed/"
+[fox_news]
+feed="https://moxie.foxnews.com/google-publisher/latest.xml"
 category="right"
 contains_articles=true
-name="The American Conservative"
-sort="American Conservative"
+name="Fox News"
+
+[oann]
+feed="https://www.oann.com/category/newsroom/feed"
+category="right"
+contains_articles=true
+name="One America News Network"
+
+[nypost]
+feed="https://nypost.com/feed"
+category="right"
+contains_articles=false
+exclude=[
+    { type="startswith", pattern="https://pagesix.com" },
+    { type="startswith", pattern="https://decider.com" },
+]
+name="New York Post"
+
+[federalist]
+feed="https://thefederalist.com/feed"
+category="right"
+contains_articles=false
+name="The Federalist"
+sort="Federalist"
+
+[washington_examiner]
+feed="https://feeds.feedburner.com/dcexaminer/Politics"
+category="right"
+contains_articles=true
+name="Washington Examiner"

 [american_thinker]
 feed="https://feeds.feedburner.com/americanthinker_articles"
@ -104,66 +113,30 @@ category="right"
 contains_articles=false
 name="Breitbart"

-[daily_caller]
-feed="https://feeds.feedburner.com/dailycaller"
-category="right"
-contains_articles=false
-name="Daily Caller"
-
 [epoch_times]
 feed="https://www.theepochtimes.com/feed/"
 category="right"
 contains_articles=false
 name="Epoch Times"

-[federalist]
-feed="https://thefederalist.com/feed"
-category="right"
-contains_articles=false
-name="The Federalist"
-sort="Federalist"
-
-[fox_news]
-feed="https://moxie.foxnews.com/google-publisher/latest.xml"
-category="right"
-contains_articles=true
-name="Fox News"
-
-[lifesitenews]
-feed="https://www.lifesitenews.com/ldn/rss/headlines.xml"
-category="right"
-contains_articles=false
-name="LifeSiteNews"
-
 [not_the_bee]
 feed="https://notthebee.com/feed"
 category="right"
 contains_articles=false
 name="Not the Bee"

-[nypost]
-feed="https://nypost.com/news/feed"
+[daily_caller]
+feed="https://feeds.feedburner.com/dailycaller"
 category="right"
 contains_articles=false
-name="New York Post"
+name="Daily Caller"

-[oann]
-feed="https://www.oann.com/category/newsroom/feed"
+[american_conservative]
+feed="https://theamericanconservative.com/articles/feed/"
 category="right"
 contains_articles=true
-name="One America News Network"
-
-[redstate]
-feed="https://redstate.com/feed"
-category="right"
-contains_articles=false
-name="RedState"
-
-[washington_examiner]
-feed="https://feeds.feedburner.com/dcexaminer/Politics"
-category="right"
-contains_articles=true
-name="Washington Examiner"
+name="The American Conservative"
+sort="American Conservative"

 #[newsmax]
 #feed="https://www.newsmax.com/rss/Newsfront/16/"
--- a/stats.py
+++ b/stats.py
@ -1,21 +1,3 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-# Copyright (c) 2022 Samuel L Sloniker
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU General Public License as published by the Free Software
-# Foundation, either version 3 of the License, or (at your option) any later
-# version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-# details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program. If not, see <https://www.gnu.org/licenses/>. 
-
 import sqlite3
 import tomli