Export compiler.toml

Decrease ngram length to 5
Add analyses
12 changed files with 526 additions and 82 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -139,4 +139,6 @@ cython_debug/
 # Model
 *.db
 *.db-journal
 *.gptc
 build/
--- a/README.md
+++ b/README.md
@@ -3,19 +3,46 @@
 A [GPTC](https://git.kj7rrv.com/kj7rrv/gptc) model to classify American news as
 right- or left-leaning
 ## Scripts
 No scripts take any arguments.
 * `./download.py`: download new articles and add them to the database
 * `./compile.py`: compile GPTC model
 * `./export.py`: create `build/` directory with files for release
 * `./stats.py`: print statistics on article and source counts
 ## Sources
 Inclusion of a site in this model is not an endorsement of the site.
 ### Left
 * ABC News
 * The Atlantic
 * CBS News
 * CNBC
 * CNN
 * Democracy Now!
 * HuffPost (formerly Huffington Post)
-* CNBC
+* The Intercept
 * PBS NewsHour
 * Los Angeles Times
 * PBS NewsHour
 * Slate
 * The Washington Post
 ### Right
 * The American Conservative
 * American Thinker
 * Breitbart
 * Daily Caller
 * Epoch Times
 * The Federalist
 * Fox News
-* One America News Network
+* LifeSiteNews
 * New York Post
-* The Federalist
+* Not the Bee
 * One America News Network
 * RedState
 * Washington Examiner
--- a/analyses/constitutional_amendments.py
+++ b/analyses/constitutional_amendments.py
@@ -0,0 +1,70 @@
 import gptc
 # (ordinal, spelled-out) spellings of each amendment number; both spellings
 # are looked up in the model below.
 amendments = [
    ("1st", "First"),
    ("2nd", "Second"),
    ("3rd", "Third"),
    ("4th", "Fourth"),
    ("5th", "Fifth"),
    ("6th", "Sixth"),
    ("7th", "Seventh"),
    ("8th", "Eighth"),
    ("9th", "Ninth"),
    ("10th", "Tenth"),
    ("11th", "Eleventh"),
    ("12th", "Twelfth"),
    ("13th", "Thirteenth"),
    ("14th", "Fourteenth"),
    ("15th", "Fifteenth"),
    ("16th", "Sixteenth"),
    ("17th", "Seventeenth"),
    ("18th", "Eighteenth"),
    ("19th", "Nineteenth"),
    ("20th", "Twentieth"),
    ("21st", "Twenty-first"),
    ("22nd", "Twenty-second"),
    ("23rd", "Twenty-third"),
    ("24th", "Twenty-fourth"),
    ("25th", "Twenty-fifth"),
    ("26th", "Twenty-sixth"),
    ("27th", "Twenty-seventh"),
 ]
 with open("model.gptc", "rb") as f:
    model = gptc.deserialize(f)
 # Map each spelled-out amendment name to its per-category values. When both
 # spellings have data the two are averaged key by key; when only one has data
 # it is used as-is; amendments with no data at all are left out.
 data = {}
 for number, name in amendments:
    by_number = model.get(number + " Amendment")
    by_name = model.get(name + " Amendment")
    if by_number and by_name:
        data[name] = {
            key: (by_number[key] + by_name[key]) / 2
            for key in by_number.keys()
        }
    elif by_number:
        data[name] = by_number
    elif by_name:
        data[name] = by_name
 # Order the rows by ascending "left" value.
 classified_amendments = list(data.items())
 classified_amendments.sort(key=lambda row: row[1]["left"])
 print("# Constitutional Amendment Analysis")
 print()
 print("""This is an analysis of which amendments to the U.S. Constitution are mentioned
 more in right- or left-leaning American news sources. Data do not necessarily
 correlate with support or opposition for the amendment among right- or
 left-leaning Americans.""")
 print()
 print("| Amendment      | Left  | Right |")
 print("+----------------+-------+-------+")
 for amendment, weights in classified_amendments:
    right_cell = f"{weights['right']*100:>4.1f}%"
    left_cell = f"{weights['left']*100:>4.1f}%"
    # The name column is 14 characters wide (the longest name, "Twenty-seventh").
    print(f"| {amendment:<14} | {left_cell} | {right_cell} |")
 print("+----------------+-------+-------+")
 print("| Amendment      | Left  | Right |")
--- a/analyses/states.py
+++ b/analyses/states.py
@@ -0,0 +1,85 @@
 import gptc
 # The fifty U.S. states, looked up by name in the model below.
 states = [
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming",
 ]
 with open("model.gptc", "rb") as f:
    model = gptc.deserialize(f)
 # analyses/constitutional_amendments.py treats a falsy `model.get` result as
 # "no data"; do the same here so a state the model does not know is left out
 # of the table instead of raising on the sort key below.
 classified_states = []
 for state in states:
    state_data = model.get(state)
    if state_data:
        classified_states.append((state, state_data))
 # Order the rows by ascending "left" value.
 classified_states.sort(key=lambda x: x[1]["left"])
 longest = max(len(state) for state in states)
 print("# State Analysis")
 print()
 print("""This is an analysis of which states are mentioned more in right- or left-
 leaning American news sources. Results do not necessarily correlate with the
 political views of residents of the states; for example, the predominantly
 liberal state of Oregon is mentioned more in right-leaning sources than in
 left-leaning ones.""")
 print()
 print("| State          | Left  | Right |")
 print("+----------------+-------+-------+")
 for state, data in classified_states:
    # Fixed-width, one-decimal percentages (same format as the amendment
    # analysis) so values below 10% stay aligned with the 5-wide columns.
    percent_right = f"{data['right']*100:>4.1f}%"
    percent_left = f"{data['left']*100:>4.1f}%"
    state_padding = " "*(longest - len(state))
    print(f"| {state}{state_padding} | {percent_left} | {percent_right} |")
 print("+----------------+-------+-------+")
 print("| State          | Left  | Right |")
--- a/compile.py
+++ b/compile.py
@@ -0,0 +1,42 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-3.0-or-later
 # Copyright (c) 2022 Samuel L Sloniker
 #
 # This program is free software: you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation, either version 3 of the License, or (at your option) any later
 # version.
 #
 # This program is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 # details.
 #
 # You should have received a copy of the GNU General Public License along with
 # this program. If not, see <https://www.gnu.org/licenses/>.
 import sqlite3
 import tomli
 import gptc
 # Compiler settings (max_ngram_length, min_count) are read from compiler.toml.
 with open("compiler.toml", "rb") as f:
    config = tomli.load(f)
 con = sqlite3.connect("articles.db")
 try:
    con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")
    # One training record per stored article: its text and its category.
    raw_model = [
        {"text": text, "category": category}
        for text, category in con.execute("SELECT text, category FROM articles;")
    ]
    con.commit()
 finally:
    # Release the database even if reading the articles fails.
    con.close()
 # Compile before opening the output file: opening with "w+b" truncates
 # model.gptc, so a failed compile must not be allowed to wipe the old model.
 model = gptc.compile(
    raw_model,
    max_ngram_length=config["max_ngram_length"],
    min_count=config["min_count"],
 )
 with open("model.gptc", "w+b") as f:
    model.serialize(f)
--- a/compiler.toml
+++ b/compiler.toml
@@ -0,0 +1,2 @@
 max_ngram_length=5
 min_count=5
--- a/download.py
+++ b/download.py
@@ -1,11 +1,37 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-3.0-or-later
 # Copyright (c) 2022 Samuel L Sloniker
 #
 # This program is free software: you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation, either version 3 of the License, or (at your option) any later
 # version.
 #
 # This program is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 # details.
 #
 # You should have received a copy of the GNU General Public License along with
 # this program. If not, see <https://www.gnu.org/licenses/>. 
 import feedparser
 import hashlib
 import sqlite3
 import goose3
 import tomli
 import gptc
 import bs4
 def matches(string, checks):
    """Return True if *string* satisfies at least one of *checks*.

    Each check is indexed by "type"; only "startswith" checks are recognised,
    and those match when *string* starts with the check's "pattern". Checks
    of any other type never match.
    """
    return any(
        rule["type"] == "startswith" and string.startswith(rule["pattern"])
        for rule in checks
    )
 # Load the source definitions from sources.toml; the file is opened in binary
 # mode because it is handed straight to tomli.load.
 with open("sources.toml", "rb") as f:
    sources = tomli.load(f)
@@ -39,49 +65,43 @@ try:
            entry
            for entry in feedparser.parse(url)["entries"]
            if not entry["link"] in known
            and not matches(entry["link"], config.get("exclude", []))
        ]
        print(f"Fetched feed. Found {len(entries)} new articles.")
        if contains_articles:
            for entry in entries:
-                print(f"Saving {entry['link']}")
+                try:
-                con.execute(
+                    print(f"Saving {entry['link']}")
-                    "INSERT INTO articles VALUES (?, ?, ?, ?);",
+                    con.execute(
-                    (
+                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
-                        name,
+                        (
-                        category,
+                            name,
-                        entry["link"],
+                            category,
-                        bs4.BeautifulSoup(
+                            entry["link"],
-                            entry["content"][0]["value"], features="lxml"
+                            bs4.BeautifulSoup(
-                        ).text,
+                                entry["content"][0]["value"], features="lxml"
-                    ),
+                            ).text,
-                )
+                        ),
                    )
                except KeyError:
                    print("Not enough information. Skipping.")
        else:
            for entry in entries:
-                print(f"Downloading {entry['link']}...")
+                try:
-                con.execute(
+                    print(f"Downloading {entry['link']}...")
-                    "INSERT INTO articles VALUES (?, ?, ?, ?);",
+                    con.execute(
-                    (
+                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
-                        name,
+                        (
-                        category,
+                            name,
-                        entry["link"],
+                            category,
-                        g.extract(entry["link"]).cleaned_text,
+                            entry["link"],
-                    ),
+                            g.extract(entry["link"]).cleaned_text,
-                )
+                        ),
-                print(f"Done downloading.")
+                    )
                    print(f"Done downloading.")
                except KeyError:
                    print("Not enough information. Skipping.")
 finally:
    con.commit()
    print("Compiling model...")
    raw_model = [
        {"text": i[0], "category": i[1]}
        for i in con.execute("SELECT text, category FROM articles;")
    ]
    with open("model.gptc", "w+b") as f:
        f.write(
            gptc.compile(raw_model, max_ngram_length=3, min_count=3).serialize()
        )
    con.close()
--- a/export.py
+++ b/export.py
@@ -0,0 +1,39 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-3.0-or-later
 # Copyright (c) 2022 Samuel L Sloniker
 #
 # This program is free software: you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation, either version 3 of the License, or (at your option) any later
 # version.
 #
 # This program is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 # details.
 #
 # You should have received a copy of the GNU General Public License along with
 # this program. If not, see <https://www.gnu.org/licenses/>. 
 import os
 import shutil
 import sqlite3
 # Start from a fresh build/ directory; it is fine if none exists yet.
 try:
    shutil.rmtree("build")
 except FileNotFoundError:
    pass
 os.mkdir("build")
 # Copy each release file into build/ under the same name.
 for release_file in ("articles.db", "sources.toml", "compiler.toml", "model.gptc"):
    shutil.copy(release_file, f"build/{release_file}")
 # Only the copy in build/ is modified: every article's text is replaced with a
 # placeholder, then the database file is rebuilt with VACUUM.
 export_db = sqlite3.Connection("build/articles.db")
 export_db.execute("UPDATE articles SET text = '***';")
 export_db.commit()
 export_db.execute("VACUUM;")
 export_db.commit()
 export_db.close()
--- a/export.sh
+++ b/export.sh
@@ -1,8 +0,0 @@
 #!/bin/sh
 # Temporarily strip the article text from articles.db, wait for the user,
 # then restore the original database.
 # Keep a full copy so the original can be put back at the end.
 cp articles.db old_articles.db
 # Replace every article's text with a placeholder and rebuild the file.
 sqlite3 articles.db 'UPDATE articles SET text = "***";'
 sqlite3 articles.db 'VACUUM;'
 # Pause while the stripped database is in place (presumably so it can be
 # copied out for release -- confirm).
 # NOTE(review): `echo -n` is not portable across /bin/sh implementations.
 echo -n "Press enter when done..."
 read
 # Restore the original database.
 rm articles.db
 mv old_articles.db articles.db
--- a/sources.toml
+++ b/sources.toml
@@ -5,55 +5,165 @@
 # ? Newsmax (read timeout errors)
 # ? Bloomberg (CAPTCHA on RSS feed?)
 [abc_news]
 feed="https://abcnews.go.com/abcnews/usheadlines"
 category="left"
 contains_articles=false
 name="ABC News"
 [atlantic]
 feed="https://www.theatlantic.com/feed/all/"
 category="left"
 contains_articles=true
 name="The Atlantic"
 sort="Atlantic"
 [cbs_news]
 feed="https://www.cbsnews.com/latest/rss/main"
 category="left"
 contains_articles=false
 name="CBS News"
 [cnbc]
 feed="https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
 category="left"
 contains_articles=false
 name="CNBC"
 [cnn]
 feed="http://rss.cnn.com/rss/cnn_latest.rss"
 category="left"
 contains_articles=false
 name="CNN"
 [democracy_now]
 feed="https://www.democracynow.org/democracynow.rss"
 category="left"
 contains_articles=false
 name="Democracy Now!"
 [huffpost]
 feed="https://chaski.huffpost.com/us/auto"
 category="left"
 contains_articles=false
 name="HuffPost"
-[cnbc]
+[intercept]
-feed="https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
+feed="https://theintercept.com/feed/?lang=en"
 category="left"
 contains_articles=true
 name="The Intercept"
 sort="Intercept"
 [latimes]
 feed="https://www.latimes.com/local/rss2.0.xml"
 category="left"
 contains_articles=false
 name="Los Angeles Times"
 [pbs_newshour]
 feed="https://www.pbs.org/newshour/feeds/rss/headlines"
 category="left"
 contains_articles=false
 name="PBS NewsHour"
-[latimes]
+[slate]
-feed="https://www.latimes.com/local/rss2.0.xml"
+feed="http://www.slate.com/articles/news_and_politics.fulltext.all.10.rss"
 category="left"
 contains_articles=false
 name="Slate"
 [washington_post]
 feed="https://feeds.washingtonpost.com/rss/national"
 category="left"
 contains_articles=false
 name="The Washington Post"
 sort="Washington Post"
 #[bloomberg]
 #feed="https://www.bloomberg.com/politics/feeds/site.xml"
 #category="left"
 #contains_articles=false
-[fox]
+[american_conservative]
 feed="https://theamericanconservative.com/articles/feed/"
 category="right"
 contains_articles=true
 name="The American Conservative"
 sort="American Conservative"
 [american_thinker]
 feed="https://feeds.feedburner.com/americanthinker_articles"
 category="right"
 contains_articles=false
 name="American Thinker"
 [breitbart]
 feed="https://feeds.feedburner.com/breitbart/"
 category="right"
 contains_articles=false
 name="Breitbart"
 [daily_caller]
 feed="https://feeds.feedburner.com/dailycaller"
 category="right"
 contains_articles=false
 name="Daily Caller"
 [epoch_times]
 feed="https://www.theepochtimes.com/feed/"
 category="right"
 contains_articles=false
 name="Epoch Times"
 [federalist]
 feed="https://thefederalist.com/feed"
 category="right"
 contains_articles=false
 name="The Federalist"
 sort="Federalist"
 [fox_news]
 feed="https://moxie.foxnews.com/google-publisher/latest.xml"
 category="right"
 contains_articles=true
 name="Fox News"
 [lifesitenews]
 feed="https://www.lifesitenews.com/ldn/rss/headlines.xml"
 category="right"
 contains_articles=false
 name="LifeSiteNews"
 [not_the_bee]
 feed="https://notthebee.com/feed"
 category="right"
 contains_articles=false
 name="Not the Bee"
 [nypost]
 feed="https://nypost.com/news/feed"
 category="right"
 contains_articles=false
 name="New York Post"
 [oann]
 feed="https://www.oann.com/category/newsroom/feed"
 category="right"
 contains_articles=true
 name="One America News Network"
-[nypost]
+[redstate]
-feed="https://nypost.com/feed"
+feed="https://redstate.com/feed"
 category="right"
 contains_articles=false
 name="RedState"
-[federalist]
+[washington_examiner]
-feed="https://thefederalist.com/feed"
+feed="https://feeds.feedburner.com/dcexaminer/Politics"
 category="right"
-contains_articles=false
+contains_articles=true
 name="Washington Examiner"
 #[newsmax]
 #feed="https://www.newsmax.com/rss/Newsfront/16/"
--- a/stats.py
+++ b/stats.py
@@ -0,0 +1,77 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-3.0-or-later
 # Copyright (c) 2022 Samuel L Sloniker
 #
 # This program is free software: you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free Software
 # Foundation, either version 3 of the License, or (at your option) any later
 # version.
 #
 # This program is distributed in the hope that it will be useful, but WITHOUT
 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 # details.
 #
 # You should have received a copy of the GNU General Public License along with
 # this program. If not, see <https://www.gnu.org/licenses/>. 
 import sqlite3
 import tomli
 # Source definitions (name, category, optional sort key) come from sources.toml.
 with open("sources.toml", "rb") as f:
    sources = tomli.load(f)
 con = sqlite3.connect("articles.db")
 con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")
 # Let SQLite count the rows instead of fetching every URL into a list just to
 # take its length.
 count_query = "SELECT COUNT(*) FROM articles"
 article_count = con.execute(count_query).fetchone()[0]
 left_article_count = con.execute(count_query + " WHERE category = 'left'").fetchone()[0]
 right_article_count = con.execute(count_query + " WHERE category = 'right'").fetchone()[0]
 source_count = len(sources)
 left_sources = []
 right_sources = []
 for source_id, source_info in sources.items():
    # Any category other than "left" is counted as right.
    source_list = left_sources if source_info["category"] == "left" else right_sources
    source_list.append({
        "name": source_info["name"],
        # Sources without an explicit sort key are ordered by their name.
        "sort": source_info.get("sort", source_info["name"]),
        "count": con.execute(count_query + " WHERE source = ?", (source_id,)).fetchone()[0],
        })
 left_source_count = len(left_sources)
 right_source_count = len(right_sources)
 left_sources.sort(key=lambda x: x["sort"])
 right_sources.sort(key=lambda x: x["sort"])
 left_breakdown = "\n".join([f"* {source['name']}: {source['count']}" for source in left_sources])
 right_breakdown = "\n".join([f"* {source['name']}: {source['count']}" for source in right_sources])
 con.commit()
 con.close()
 print(f"""\
 This model contains a total of {article_count} articles from {source_count} sources.
 ## Left
 {left_breakdown}
 Left total: {left_article_count} articles from {left_source_count} sources
 ## Right
 {right_breakdown}
 Right total: {right_article_count} articles from {right_source_count} sources""")
--- a/stats.sh
+++ b/stats.sh
@@ -1,22 +0,0 @@
 #!/bin/bash
 # Print article counts for the whole database, then per source, grouped by
 # category.
 # Counts are taken by counting the lines sqlite3 prints for each query.
 total=$(sqlite3 articles.db "SELECT url FROM articles" | wc -l)
 left=$(sqlite3 articles.db "SELECT url FROM articles WHERE category = 'left'" | wc -l)
 right=$(sqlite3 articles.db "SELECT url FROM articles WHERE category = 'right'" | wc -l)
 # Distinct source values per category.
 left_sources=$(sqlite3 articles.db "SELECT source FROM articles WHERE category = 'left'" | sort | uniq)
 right_sources=$(sqlite3 articles.db "SELECT source FROM articles WHERE category = 'right'" | sort | uniq)
 echo "This model contains a total of $total articles ($left left, $right right)."
 echo ""
 echo "## Left"
 echo ""
 # NOTE(review): $left_sources is expanded unquoted, so a source value that
 # contains whitespace would be split into several loop items; $i is also
 # pasted directly into the SQL text.
 for i in $left_sources; do
    echo "* $i: $(sqlite3 articles.db "SELECT url FROM articles WHERE source = '$i'" | wc -l)"
 done
 echo ""
 echo "## Right"
 echo ""
 # Same per-source count for the right-leaning sources (same caveats as above).
 for i in $right_sources; do
    echo "* $i: $(sqlite3 articles.db "SELECT url FROM articles WHERE source = '$i'" | wc -l)"
 done
Author	SHA1	Message	Date
Samuel Sloniker	9c66b18cfe	Export compiler.toml	1 year ago
Samuel Sloniker	fe088822e1	Decrease ngram length to 5	1 year ago
Samuel Sloniker	1daab919ea	Add analyses	1 year ago
Samuel Sloniker	4eeb8d2d17	Use GPTC v4.0.0	1 year ago
Samuel Sloniker	f1ccaaabab	Add LifeSiteNews	1 year ago
Samuel Sloniker	9d82b07f17	Add RedState	1 year ago
Samuel Sloniker	f25594d771	Reduce model size	1 year ago
Samuel Sloniker	af9e5e92a3	Use 10-grams	1 year ago
Samuel Sloniker	b485780738	Fix sources list in README	1 year ago
Samuel Sloniker	82846f39ba	Add Democracy Now!	1 year ago
Samuel Sloniker	5f3a2977f1	Add Slate	1 year ago
Samuel Sloniker	314bdef1c5	Compiler settings	1 year ago
Samuel Sloniker	28f81c9a63	Change minimum use count to 5	1 year ago
Samuel Sloniker	06190f5101	Add Washington Post	1 year ago
Samuel Sloniker	68c8949005	Sort sources in config file	1 year ago
Samuel Sloniker	29d77b5393	Only get news from New York Post	1 year ago
Samuel Sloniker	9f3abd8641	Add license headers to code	1 year ago
Samuel Sloniker	17bb8a4f3f	Document scripts	1 year ago
Samuel Sloniker	f0b93cd2f6	Make scripts executable	1 year ago
Samuel Sloniker	aa8fa31195	Rewrite export script in Python; use build dir	1 year ago
Samuel Sloniker	e5e046b70d	Add sources	1 year ago
Samuel Sloniker	54d97a3a16	Remove extra newline from stats	2 years ago
Samuel Sloniker	487b087910	Rewrite stats script in Python	2 years ago
Samuel Sloniker	43faa6139a	Format download.py	2 years ago
Samuel Sloniker	5f7fd0ccb5	Add names and sort keys	2 years ago
Samuel Sloniker	af5f3c3df1	Split download into download and compile	2 years ago
Samuel Sloniker	a96d474e37	Update gitignore	2 years ago
Samuel Sloniker	ee8189d476	Update stats script	2 years ago
Samuel Sloniker	1d2cfab68c	More sources	2 years ago
Samuel Sloniker	c5cc6d78f9	More sources	2 years ago
Samuel Sloniker	e206210ec5	Disclaimer	2 years ago
Samuel Sloniker	7bf17b150e	Add ability to exclude articles by URL	2 years ago