11 changed files with 88 additions and 380 deletions
--- a/.gitignore
+++ b/.gitignore
@ -141,4 +141,3 @@ cython_debug/
 *.db
 *.db-journal
 *.gptc
 build/
--- a/README.md
+++ b/README.md
@ -3,19 +3,10 @@
 A [GPTC](https://git.kj7rrv.com/kj7rrv/gptc) model to classify American news as
 right- or left-leaning
-## Scripts
+Inclusion of a site in this model is not an endorsement of the site.
 No scripts take any arguments.
 * `./download.py`: download new articles and add them to the database
 * `./compile.py`: compile GPTC model
 * `./export.py`: create `build/` directory with files for release
 * `./stats.py`: print statistics on article and source counts
 ## Sources
 Inclusion of a site in this model is not an endorsement of the site.
 ### Left
 * ABC News
@ -23,26 +14,19 @@ Inclusion of a site in this model is not an endorsement of the site.
 * CBS News
 * CNBC
 * CNN
 * Democracy Now!
 * HuffPost (formerly Huffington Post)
 * The Intercept
 * Los Angeles Times
 * PBS NewsHour
 * Slate
 * The Washington Post
 ### Right
 * The American Conservative
 * American Thinker
 * Breitbart
 * Daily Caller
 * Epoch Times
 * The Federalist
 * Fox News
 * LifeSiteNews
 * New York Post
 * Not the Bee
 * New York Post
 * One America News Network
 * RedState
 * Washington Examiner
--- a/analyses/constitutional_amendments.py
+++ b/analyses/constitutional_amendments.py
@ -1,70 +0,0 @@
 import gptc
 # (abbreviated ordinal, spelled-out ordinal) for each of the 27 amendments.
 # Both spellings are looked up in the model with " Amendment" appended, and
 # the spelled-out form is used as the label in the printed table.
 amendments = [
    ("1st", "First"),
    ("2nd", "Second"),
    ("3rd", "Third"),
    ("4th", "Fourth"),
    ("5th", "Fifth"),
    ("6th", "Sixth"),
    ("7th", "Seventh"),
    ("8th", "Eighth"),
    ("9th", "Ninth"),
    ("10th", "Tenth"),
    ("11th", "Eleventh"),
    ("12th", "Twelfth"),
    ("13th", "Thirteenth"),
    ("14th", "Fourteenth"),
    ("15th", "Fifteenth"),
    ("16th", "Sixteenth"),
    ("17th", "Seventeenth"),
    ("18th", "Eighteenth"),
    ("19th", "Nineteenth"),
    ("20th", "Twentieth"),
    ("21st", "Twenty-first"),
    ("22nd", "Twenty-second"),
    ("23rd", "Twenty-third"),
    ("24th", "Twenty-fourth"),
    ("25th", "Twenty-fifth"),
    ("26th", "Twenty-sixth"),
    ("27th", "Twenty-seventh"),
 ]
 # Load the serialized GPTC model from the working directory.
 with open("model.gptc", "rb") as model_file:
     model = gptc.deserialize(model_file)
 # Collect per-amendment model data keyed by the spelled-out ordinal, merging
 # the lookups for the abbreviated ("1st Amendment") and spelled-out
 # ("First Amendment") spellings.
 data = {}
 for number, name in amendments:
     number_data = model.get(number + " Amendment")
     name_data = model.get(name + " Amendment")
     if number_data and name_data:
         # Both spellings are known: average the two values key by key.
         data[name] = {
             key: (value + name_data[key]) / 2
             for key, value in number_data.items()
         }
     elif number_data:
         data[name] = number_data
     elif name_data:
         data[name] = name_data
     # Neither spelling is known: the amendment is left out of the table.
 # Ascending by the model's "left" value.
 classified_amendments = list(data.items())
 classified_amendments.sort(key=lambda entry: entry[1]["left"])
 print("# Constitutional Amendment Analysis")
 print()
 print("""This is an analysis of which amendments to the U.S. Constitution are mentioned
 more in right- or left-leaning American news sources. Data do not necessarily
 correlate with support or opposition for the amendment among right- or
 left-leaning Americans.""")
 print()
 print("| Amendment      | Left  | Right |")
 print("+----------------+-------+-------+")
 for amendment, weights in classified_amendments:
     percent_right = f"{weights['right']*100:>4.1f}%"
     percent_left = f"{weights['left']*100:>4.1f}%"
     # Pad the label to the 14-character column used by the header rows.
     print(f"| {amendment.ljust(14)} | {percent_left} | {percent_right} |")
 # The separator and header are repeated below the rows as a footer.
 print("+----------------+-------+-------+")
 print("| Amendment      | Left  | Right |")
--- a/analyses/states.py
+++ b/analyses/states.py
@ -1,85 +0,0 @@
 import gptc
 # Names of the 50 U.S. states; each one is looked up in the model as-is and
 # used as the row label in the printed table.
 states = [
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming",
 ]
 # Load the serialized GPTC model from the working directory.
 with open("model.gptc", "rb") as f:
     model = gptc.deserialize(f)
 # Look up every state name. States the model has no data for are skipped so
 # the sort below cannot fail on a missing "left" entry.
 # NOTE(review): assumes model.get() returns a falsy value for tokens the
 # model has never seen - confirm against the gptc API.
 classified_states = []
 for state in states:
     state_data = model.get(state)
     if state_data:
         classified_states.append((state, state_data))
 # Ascending by the model's "left" value.
 classified_states.sort(key=lambda x: x[1]["left"])
 # Width of the name column; the header, separator and rows all derive from
 # it so the table stays aligned if the list of names changes.
 longest = max(len(state) for state in states)
 header = f"| {'State':<{longest}} | Left  | Right |"
 separator = f"+{'-' * (longest + 2)}+-------+-------+"
 print("# State Analysis")
 print()
 print("""This is an analysis of which states are mentioned more in right- or left-
 leaning American news sources. Results do not necessarily correlate with the
 political views of residents of the states; for example, the predominantly
 liberal state of Oregon is mentioned more in right-leaning sources than in
 left-leaning ones.""")
 print()
 print(header)
 print(separator)
 for state, data in classified_states:
     # Fixed-width percentages (e.g. " 5.0%" / "52.3%") keep columns aligned.
     percent_right = f"{data['right']*100:>4.1f}%"
     percent_left = f"{data['left']*100:>4.1f}%"
     print(f"| {state:<{longest}} | {percent_left} | {percent_right} |")
 # The separator and header are repeated below the rows as a footer.
 print(separator)
 print(header)
--- a/compile.py
+++ b/compile.py
@ -1,28 +1,6 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-3.0-or-later
 # Copyright (c) 2022 Samuel L Sloniker
 #
 # This program is free software: you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation, either version 3 of the License, or (at your option) any later
 # version.
 #
 # This program is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 # details.
 #
 # You should have received a copy of the GNU General Public License along with
 # this program. If not, see <https://www.gnu.org/licenses/>.
 import sqlite3
 import tomli
 import gptc
 with open("compiler.toml", "rb") as f:
    config = tomli.load(f)
 con = sqlite3.connect("articles.db")
 con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")
@ -32,11 +10,9 @@ raw_model = [
 ]
 with open("model.gptc", "w+b") as f:
-    gptc.compile(
+    f.write(
-        raw_model,
+        gptc.compile(raw_model, max_ngram_length=3, min_count=3).serialize()
-        max_ngram_length=config["max_ngram_length"],
+    )
        min_count=config["min_count"],
    ).serialize(f)
 con.commit()
 con.close()
--- a/compiler.toml
+++ b/compiler.toml
@ -1,2 +0,0 @@
 max_ngram_length=5
 min_count=5
--- a/download.py
+++ b/download.py
@ -1,21 +1,3 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-3.0-or-later
 # Copyright (c) 2022 Samuel L Sloniker
 #
 # This program is free software: you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation, either version 3 of the License, or (at your option) any later
 # version.
 #
 # This program is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 # details.
 #
 # You should have received a copy of the GNU General Public License along with
 # this program. If not, see <https://www.gnu.org/licenses/>. 
 import feedparser
 import sqlite3
 import goose3
--- a/export.py
+++ b/export.py
@ -1,39 +0,0 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-3.0-or-later
 # Copyright (c) 2022 Samuel L Sloniker
 #
 # This program is free software: you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation, either version 3 of the License, or (at your option) any later
 # version.
 #
 # This program is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 # details.
 #
 # You should have received a copy of the GNU General Public License along with
 # this program. If not, see <https://www.gnu.org/licenses/>. 
 import os
 import shutil
 import sqlite3
 # Start from a fresh build/ directory; it is fine if none exists yet.
 try:
     shutil.rmtree("build")
 except FileNotFoundError:
     pass
 os.mkdir("build")
 # Copy the release artifacts into build/.
 for source, destination in (
     ("articles.db", "build/articles.db"),
     ("sources.toml", "build/sources.toml"),
     ("compiler.toml", "build/compiler.toml"),
     ("model.gptc", "build/model.gptc"),
 ):
     shutil.copy(source, destination)
 # Replace every article's text in the copied database with a placeholder,
 # then VACUUM; each statement is committed before the next one runs.
 con = sqlite3.Connection("build/articles.db")
 for statement in ("UPDATE articles SET text = '***';", "VACUUM;"):
     con.execute(statement)
     con.commit()
 con.close()
--- a/export.sh
+++ b/export.sh
@ -0,0 +1,8 @@
 #!/bin/sh
 # Temporarily replace articles.db with a copy whose article text is blanked
 # out, wait for the user to press enter, then put the original back.
 #
 # Abort on the first failing command: if the backup copy cannot be made, the
 # original database must not be modified or removed.
 set -e
 cp articles.db old_articles.db
 # From here on the backup exists, so always restore it when the script ends,
 # whether it finishes normally, a command fails, or it is interrupted.
 trap 'mv -f old_articles.db articles.db' EXIT
 trap 'exit 1' HUP INT TERM
 # Single quotes around the SQL string literal: SQLite treats double-quoted
 # text as an identifier first and only falls back to a string literal.
 sqlite3 articles.db "UPDATE articles SET text = '***';"
 sqlite3 articles.db 'VACUUM;'
 # printf instead of `echo -n`, whose behaviour is not portable under sh.
 printf '%s' "Press enter when done..."
 # POSIX read needs a variable name; EOF (Ctrl-D) counts as "done" too.
 read -r _reply || true
--- a/sources.toml
+++ b/sources.toml
@ -5,11 +5,41 @@
 # ? Newsmax (read timeout errors)
 # ? Bloomberg (CAPTCHA on RSS feed?)
-[abc_news]
+[cnn]
-feed="https://abcnews.go.com/abcnews/usheadlines"
+feed="http://rss.cnn.com/rss/cnn_latest.rss"
 category="left"
 contains_articles=false
-name="ABC News"
+name="CNN"
 [huffpost]
 feed="https://chaski.huffpost.com/us/auto"
 category="left"
 contains_articles=false
 name="HuffPost"
 [cnbc]
 feed="https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
 category="left"
 contains_articles=false
 name="CNBC"
 [pbs_newshour]
 feed="https://www.pbs.org/newshour/feeds/rss/headlines"
 category="left"
 contains_articles=false
 name="PBS NewsHour"
 [latimes]
 feed="https://www.latimes.com/local/rss2.0.xml"
 category="left"
 contains_articles=false
 name="Los Angeles Times"
 [cbs_news]
 feed="https://www.cbsnews.com/latest/rss/main"
 category="left"
 contains_articles=false
 name="CBS News"
 [atlantic]
 feed="https://www.theatlantic.com/feed/all/"
@ -18,35 +48,11 @@ contains_articles=true
 name="The Atlantic"
 sort="Atlantic"
-[cbs_news]
+[abc_news]
-feed="https://www.cbsnews.com/latest/rss/main"
+feed="https://abcnews.go.com/abcnews/usheadlines"
 category="left"
 contains_articles=false
-name="CBS News"
+name="ABC News"
 [cnbc]
 feed="https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
 category="left"
 contains_articles=false
 name="CNBC"
 [cnn]
 feed="http://rss.cnn.com/rss/cnn_latest.rss"
 category="left"
 contains_articles=false
 name="CNN"
 [democracy_now]
 feed="https://www.democracynow.org/democracynow.rss"
 category="left"
 contains_articles=false
 name="Democracy Now!"
 [huffpost]
 feed="https://chaski.huffpost.com/us/auto"
 category="left"
 contains_articles=false
 name="HuffPost"
 [intercept]
 feed="https://theintercept.com/feed/?lang=en"
@ -55,42 +61,45 @@ contains_articles=true
 name="The Intercept"
 sort="Intercept"
 [latimes]
 feed="https://www.latimes.com/local/rss2.0.xml"
 category="left"
 contains_articles=false
 name="Los Angeles Times"
 [pbs_newshour]
 feed="https://www.pbs.org/newshour/feeds/rss/headlines"
 category="left"
 contains_articles=false
 name="PBS NewsHour"
 [slate]
 feed="http://www.slate.com/articles/news_and_politics.fulltext.all.10.rss"
 category="left"
 contains_articles=false
 name="Slate"
 [washington_post]
 feed="https://feeds.washingtonpost.com/rss/national"
 category="left"
 contains_articles=false
 name="The Washington Post"
 sort="Washington Post"
 #[bloomberg]
 #feed="https://www.bloomberg.com/politics/feeds/site.xml"
 #category="left"
 #contains_articles=false
-[american_conservative]
+[fox_news]
-feed="https://theamericanconservative.com/articles/feed/"
+feed="https://moxie.foxnews.com/google-publisher/latest.xml"
 category="right"
 contains_articles=true
-name="The American Conservative"
+name="Fox News"
-sort="American Conservative"
+
 [oann]
 feed="https://www.oann.com/category/newsroom/feed"
 category="right"
 contains_articles=true
 name="One America News Network"
 [nypost]
 feed="https://nypost.com/feed"
 category="right"
 contains_articles=false
 exclude=[
    { type="startswith", pattern="https://pagesix.com" },
    { type="startswith", pattern="https://decider.com" },
 ]
 name="New York Post"
 [federalist]
 feed="https://thefederalist.com/feed"
 category="right"
 contains_articles=false
 name="The Federalist"
 sort="Federalist"
 [washington_examiner]
 feed="https://feeds.feedburner.com/dcexaminer/Politics"
 category="right"
 contains_articles=true
 name="Washington Examiner"
 [american_thinker]
 feed="https://feeds.feedburner.com/americanthinker_articles"
@ -104,66 +113,30 @@ category="right"
 contains_articles=false
 name="Breitbart"
 [daily_caller]
 feed="https://feeds.feedburner.com/dailycaller"
 category="right"
 contains_articles=false
 name="Daily Caller"
 [epoch_times]
 feed="https://www.theepochtimes.com/feed/"
 category="right"
 contains_articles=false
 name="Epoch Times"
 [federalist]
 feed="https://thefederalist.com/feed"
 category="right"
 contains_articles=false
 name="The Federalist"
 sort="Federalist"
 [fox_news]
 feed="https://moxie.foxnews.com/google-publisher/latest.xml"
 category="right"
 contains_articles=true
 name="Fox News"
 [lifesitenews]
 feed="https://www.lifesitenews.com/ldn/rss/headlines.xml"
 category="right"
 contains_articles=false
 name="LifeSiteNews"
 [not_the_bee]
 feed="https://notthebee.com/feed"
 category="right"
 contains_articles=false
 name="Not the Bee"
-[nypost]
+[daily_caller]
-feed="https://nypost.com/news/feed"
+feed="https://feeds.feedburner.com/dailycaller"
 category="right"
 contains_articles=false
-name="New York Post"
+name="Daily Caller"
-[oann]
+[american_conservative]
-feed="https://www.oann.com/category/newsroom/feed"
+feed="https://theamericanconservative.com/articles/feed/"
 category="right"
 contains_articles=true
-name="One America News Network"
+name="The American Conservative"
-
+sort="American Conservative"
 [redstate]
 feed="https://redstate.com/feed"
 category="right"
 contains_articles=false
 name="RedState"
 [washington_examiner]
 feed="https://feeds.feedburner.com/dcexaminer/Politics"
 category="right"
 contains_articles=true
 name="Washington Examiner"
 #[newsmax]
 #feed="https://www.newsmax.com/rss/Newsfront/16/"
--- a/stats.py
+++ b/stats.py
@ -1,21 +1,3 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-3.0-or-later
 # Copyright (c) 2022 Samuel L Sloniker
 #
 # This program is free software: you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation, either version 3 of the License, or (at your option) any later
 # version.
 #
 # This program is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 # details.
 #
 # You should have received a copy of the GNU General Public License along with
 # this program. If not, see <https://www.gnu.org/licenses/>. 
 import sqlite3
 import tomli