Export compiler.toml

Decrease ngram length to 5
Add analyses
2023-01-08 08:48:20 -08:00 · 2023-01-08 08:47:59 -08:00 · 2023-01-08 08:45:22 -08:00 · 2022-12-24 12:29:00 -08:00 · 2022-12-20 17:50:21 -08:00 · 2022-12-20 17:31:12 -08:00
11 changed files with 380 additions and 88 deletions
--- a/.gitignore
+++ b/.gitignore
@ -141,3 +141,4 @@ cython_debug/
 *.db
 *.db-journal
 *.gptc
+build/
--- a/README.md
+++ b/README.md
@ -3,10 +3,19 @@
 A [GPTC](https://git.kj7rrv.com/kj7rrv/gptc) model to classify American news as
 right- or left-leaning

-Inclusion of a site in this model is not an endorsement of the site.
+## Scripts
+
+No scripts take any arguments.
+
+* `./download.py`: download new articles and add them to the database
+* `./compile.py`: compile GPTC model
+* `./export.py`: create `build/` directory with files for release
+* `./stats.py`: print statistics on article and source counts

 ## Sources

+Inclusion of a site in this model is not an endorsement of the site.
+
 ### Left

 * ABC News
@ -14,19 +23,26 @@ Inclusion of a site in this model is not an endorsement of the site.
 * CBS News
 * CNBC
 * CNN
+* Democracy Now!
 * HuffPost (formerly Huffington Post)
+* The Intercept
 * Los Angeles Times
 * PBS NewsHour
+* Slate
+* The Washington Post

 ### Right

+* The American Conservative
 * American Thinker
 * Breitbart
 * Daily Caller
 * Epoch Times
 * The Federalist
 * Fox News
-* Not the Bee
+* LifeSiteNews
 * New York Post
+* Not the Bee
 * One America News Network
+* RedState
 * Washington Examiner
--- a/analyses/constitutional_amendments.py
+++ b/analyses/constitutional_amendments.py
@ -0,0 +1,70 @@
+import gptc
+
+amendments = [
+    ("1st", "First"),
+    ("2nd", "Second"),
+    ("3rd", "Third"),
+    ("4th", "Fourth"),
+    ("5th", "Fifth"),
+    ("6th", "Sixth"),
+    ("7th", "Seventh"),
+    ("8th", "Eighth"),
+    ("9th", "Ninth"),
+    ("10th", "Tenth"),
+    ("11th", "Eleventh"),
+    ("12th", "Twelfth"),
+    ("13th", "Thirteenth"),
+    ("14th", "Fourteenth"),
+    ("15th", "Fifteenth"),
+    ("16th", "Sixteenth"),
+    ("17th", "Seventeenth"),
+    ("18th", "Eighteenth"),
+    ("19th", "Nineteenth"),
+    ("20th", "Twentieth"),
+    ("21st", "Twenty-first"),
+    ("22nd", "Twenty-second"),
+    ("23rd", "Twenty-third"),
+    ("24th", "Twenty-fourth"),
+    ("25th", "Twenty-fifth"),
+    ("26th", "Twenty-sixth"),
+    ("27th", "Twenty-seventh"),
+]
+
+with open("model.gptc", "rb") as f:
+    model = gptc.deserialize(f)
+
+# Each amendment is looked up under both its ordinal ("1st") and spelled-out
+# ("First") form; forms with no data are dropped and the rest are averaged.
+scores_by_amendment = {}
+for number, name in amendments:
+    lookups = (
+        model.get(number + " Amendment"),
+        model.get(name + " Amendment"),
+    )
+    found = [scores for scores in lookups if scores]
+    if found:
+        scores_by_amendment[name] = {
+            key: sum(scores[key] for scores in found) / len(found)
+            for key in found[0]
+        }
+
+classified_amendments = sorted(
+    scores_by_amendment.items(), key=lambda item: item[1]["left"]
+)
+
+print("# Constitutional Amendment Analysis")
+print()
+print("""This is an analysis of which amendments to the U.S. Constitution are mentioned
+more in right- or left-leaning American news sources. Data do not necessarily
+correlate with support or opposition for the amendment among right- or
+left-leaning Americans.""")
+print()
+print("| Amendment      | Left  | Right |")
+print("+----------------+-------+-------+")
+# `scores` is a fresh name, so the dict built above is left intact.
+for amendment, scores in classified_amendments:
+    percent_right = f"{scores['right']*100:>4.1f}%"
+    percent_left = f"{scores['left']*100:>4.1f}%"
+    print(f"| {amendment:<14} | {percent_left} | {percent_right} |")
+print("+----------------+-------+-------+")
+print("| Amendment      | Left  | Right |")
--- a/analyses/states.py
+++ b/analyses/states.py
@ -0,0 +1,85 @@
+import gptc
+
+states = [
+    "Alabama",
+    "Alaska",
+    "Arizona",
+    "Arkansas",
+    "California",
+    "Colorado",
+    "Connecticut",
+    "Delaware",
+    "Florida",
+    "Georgia",
+    "Hawaii",
+    "Idaho",
+    "Illinois",
+    "Indiana",
+    "Iowa",
+    "Kansas",
+    "Kentucky",
+    "Louisiana",
+    "Maine",
+    "Maryland",
+    "Massachusetts",
+    "Michigan",
+    "Minnesota",
+    "Mississippi",
+    "Missouri",
+    "Montana",
+    "Nebraska",
+    "Nevada",
+    "New Hampshire",
+    "New Jersey",
+    "New Mexico",
+    "New York",
+    "North Carolina",
+    "North Dakota",
+    "Ohio",
+    "Oklahoma",
+    "Oregon",
+    "Pennsylvania",
+    "Rhode Island",
+    "South Carolina",
+    "South Dakota",
+    "Tennessee",
+    "Texas",
+    "Utah",
+    "Vermont",
+    "Virginia",
+    "Washington",
+    "West Virginia",
+    "Wisconsin",
+    "Wyoming",
+]
+
+with open("model.gptc", "rb") as f:
+    model = gptc.deserialize(f)
+
+# Pair each state with the model's values for its name. A state with no data
+# is skipped, because the sort and the table rows need a "left" entry.
+classified_states = []
+for state in states:
+    scores = model.get(state)
+    if scores:
+        classified_states.append((state, scores))
+classified_states.sort(key=lambda entry: entry[1]["left"])
+longest = max(len(state) for state in states)
+
+print("# State Analysis")
+print()
+print("""This is an analysis of which states are mentioned more in right- or left-
+leaning American news sources. Results do not necessarily correlate with the
+political views of residents of the states; for example, the predominantly
+liberal state of Oregon is mentioned more in right-leaning sources than in
+left-leaning ones.""")
+print()
+print("| State          | Left  | Right |")
+print("+----------------+-------+-------+")
+for state, scores in classified_states:
+    # Fixed-width percentages keep the columns aligned for values below 10%.
+    percent_right = f"{scores['right']*100:>4.1f}%"
+    percent_left = f"{scores['left']*100:>4.1f}%"
+    print(f"| {state:<{longest}} | {percent_left} | {percent_right} |")
+print("+----------------+-------+-------+")
+print("| State          | Left  | Right |")
--- a/compile.py
+++ b/compile.py
@ -1,6 +1,28 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# Copyright (c) 2022 Samuel L Sloniker
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <https://www.gnu.org/licenses/>.
+
 import sqlite3
+import tomli
 import gptc

+with open("compiler.toml", "rb") as f:
+    config = tomli.load(f)
+
 con = sqlite3.connect("articles.db")
 con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")

@ -10,9 +32,11 @@ raw_model = [
 ]

 with open("model.gptc", "w+b") as f:
-    f.write(
-        gptc.compile(raw_model, max_ngram_length=3, min_count=3).serialize()
-    )
+    gptc.compile(
+        raw_model,
+        max_ngram_length=config["max_ngram_length"],
+        min_count=config["min_count"],
+    ).serialize(f)

 con.commit()
 con.close()
--- a/compiler.toml
+++ b/compiler.toml
@ -0,0 +1,2 @@
+max_ngram_length=5
+min_count=5
--- a/download.py
+++ b/download.py
@ -1,3 +1,21 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# Copyright (c) 2022 Samuel L Sloniker
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <https://www.gnu.org/licenses/>.
+
 import feedparser
 import sqlite3
 import goose3
--- a/export.py
+++ b/export.py
@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# Copyright (c) 2022 Samuel L Sloniker
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <https://www.gnu.org/licenses/>.
+
+import os
+import shutil
+import sqlite3
+
+try:
+    shutil.rmtree("build")
+except FileNotFoundError:
+    pass  # first run: there is no previous build/ to clear
+os.mkdir("build")
+
+for name in ("articles.db", "sources.toml", "compiler.toml", "model.gptc"):
+    shutil.copy(name, f"build/{name}")
+
+# Blank the article text in the exported copy only, then reclaim the space.
+con = sqlite3.connect("build/articles.db")
+try:
+    con.execute("UPDATE articles SET text = '***';")
+    con.commit()  # VACUUM cannot run inside an open transaction
+    con.execute("VACUUM;")
+finally:
+    con.close()
--- a/export.sh
+++ b/export.sh
@ -1,8 +0,0 @@
-#!/bin/sh
-cp articles.db old_articles.db
-sqlite3 articles.db 'UPDATE articles SET text = "***";'
-sqlite3 articles.db 'VACUUM;'
-echo -n "Press enter when done..."
-read
-rm articles.db
-mv old_articles.db articles.db
--- a/sources.toml
+++ b/sources.toml
@ -5,41 +5,11 @@
 # ? Newsmax (read timeout errors)
 # ? Bloomberg (CAPTCHA on RSS feed?)

-[cnn]
-feed="http://rss.cnn.com/rss/cnn_latest.rss"
+[abc_news]
+feed="https://abcnews.go.com/abcnews/usheadlines"
 category="left"
 contains_articles=false
-name="CNN"
-
-[huffpost]
-feed="https://chaski.huffpost.com/us/auto"
-category="left"
-contains_articles=false
-name="HuffPost"
-
-[cnbc]
-feed="https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
-category="left"
-contains_articles=false
-name="CNBC"
-
-[pbs_newshour]
-feed="https://www.pbs.org/newshour/feeds/rss/headlines"
-category="left"
-contains_articles=false
-name="PBS NewsHour"
-
-[latimes]
-feed="https://www.latimes.com/local/rss2.0.xml"
-category="left"
-contains_articles=false
-name="Los Angeles Times"
-
-[cbs_news]
-feed="https://www.cbsnews.com/latest/rss/main"
-category="left"
-contains_articles=false
-name="CBS News"
+name="ABC News"

 [atlantic]
 feed="https://www.theatlantic.com/feed/all/"
@ -48,11 +18,35 @@ contains_articles=true
 name="The Atlantic"
 sort="Atlantic"

-[abc_news]
-feed="https://abcnews.go.com/abcnews/usheadlines"
+[cbs_news]
+feed="https://www.cbsnews.com/latest/rss/main"
 category="left"
 contains_articles=false
-name="ABC News"
+name="CBS News"
+
+[cnbc]
+feed="https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
+category="left"
+contains_articles=false
+name="CNBC"
+
+[cnn]
+feed="http://rss.cnn.com/rss/cnn_latest.rss"
+category="left"
+contains_articles=false
+name="CNN"
+
+[democracy_now]
+feed="https://www.democracynow.org/democracynow.rss"
+category="left"
+contains_articles=false
+name="Democracy Now!"
+
+[huffpost]
+feed="https://chaski.huffpost.com/us/auto"
+category="left"
+contains_articles=false
+name="HuffPost"

 [intercept]
 feed="https://theintercept.com/feed/?lang=en"
@ -61,45 +55,42 @@ contains_articles=true
 name="The Intercept"
 sort="Intercept"

+[latimes]
+feed="https://www.latimes.com/local/rss2.0.xml"
+category="left"
+contains_articles=false
+name="Los Angeles Times"
+
+[pbs_newshour]
+feed="https://www.pbs.org/newshour/feeds/rss/headlines"
+category="left"
+contains_articles=false
+name="PBS NewsHour"
+
+[slate]
+feed="http://www.slate.com/articles/news_and_politics.fulltext.all.10.rss"
+category="left"
+contains_articles=false
+name="Slate"
+
+[washington_post]
+feed="https://feeds.washingtonpost.com/rss/national"
+category="left"
+contains_articles=false
+name="The Washington Post"
+sort="Washington Post"
+
 #[bloomberg]
 #feed="https://www.bloomberg.com/politics/feeds/site.xml"
 #category="left"
 #contains_articles=false

-[fox_news]
-feed="https://moxie.foxnews.com/google-publisher/latest.xml"
+[american_conservative]
+feed="https://theamericanconservative.com/articles/feed/"
 category="right"
 contains_articles=true
-name="Fox News"
-
-[oann]
-feed="https://www.oann.com/category/newsroom/feed"
-category="right"
-contains_articles=true
-name="One America News Network"
-
-[nypost]
-feed="https://nypost.com/feed"
-category="right"
-contains_articles=false
-exclude=[
-    { type="startswith", pattern="https://pagesix.com" },
-    { type="startswith", pattern="https://decider.com" },
-]
-name="New York Post"
-
-[federalist]
-feed="https://thefederalist.com/feed"
-category="right"
-contains_articles=false
-name="The Federalist"
-sort="Federalist"
-
-[washington_examiner]
-feed="https://feeds.feedburner.com/dcexaminer/Politics"
-category="right"
-contains_articles=true
-name="Washington Examiner"
+name="The American Conservative"
+sort="American Conservative"

 [american_thinker]
 feed="https://feeds.feedburner.com/americanthinker_articles"
@ -113,30 +104,66 @@ category="right"
 contains_articles=false
 name="Breitbart"

+[daily_caller]
+feed="https://feeds.feedburner.com/dailycaller"
+category="right"
+contains_articles=false
+name="Daily Caller"
+
 [epoch_times]
 feed="https://www.theepochtimes.com/feed/"
 category="right"
 contains_articles=false
 name="Epoch Times"

+[federalist]
+feed="https://thefederalist.com/feed"
+category="right"
+contains_articles=false
+name="The Federalist"
+sort="Federalist"
+
+[fox_news]
+feed="https://moxie.foxnews.com/google-publisher/latest.xml"
+category="right"
+contains_articles=true
+name="Fox News"
+
+[lifesitenews]
+feed="https://www.lifesitenews.com/ldn/rss/headlines.xml"
+category="right"
+contains_articles=false
+name="LifeSiteNews"
+
 [not_the_bee]
 feed="https://notthebee.com/feed"
 category="right"
 contains_articles=false
 name="Not the Bee"

-[daily_caller]
-feed="https://feeds.feedburner.com/dailycaller"
+[nypost]
+feed="https://nypost.com/news/feed"
 category="right"
 contains_articles=false
-name="Daily Caller"
+name="New York Post"

-[american_conservative]
-feed="https://theamericanconservative.com/articles/feed/"
+[oann]
+feed="https://www.oann.com/category/newsroom/feed"
 category="right"
 contains_articles=true
-name="The American Conservative"
-sort="American Conservative"
+name="One America News Network"
+
+[redstate]
+feed="https://redstate.com/feed"
+category="right"
+contains_articles=false
+name="RedState"
+
+[washington_examiner]
+feed="https://feeds.feedburner.com/dcexaminer/Politics"
+category="right"
+contains_articles=true
+name="Washington Examiner"

 #[newsmax]
 #feed="https://www.newsmax.com/rss/Newsfront/16/"
--- a/stats.py
+++ b/stats.py
@ -1,3 +1,21 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# Copyright (c) 2022 Samuel L Sloniker
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program. If not, see <https://www.gnu.org/licenses/>.
+
 import sqlite3
 import tomli
Author	SHA1	Message	Date
Samuel Sloniker	9c66b18cfe	Export compiler.toml	2023-01-08 08:48:20 -08:00
Samuel Sloniker	fe088822e1	Decrease ngram length to 5	2023-01-08 08:47:59 -08:00
Samuel Sloniker	1daab919ea	Add analyses	2023-01-08 08:45:22 -08:00
Samuel Sloniker	4eeb8d2d17	Use GPTC v4.0.0	2022-12-24 12:29:00 -08:00
Samuel Sloniker	f1ccaaabab	Add LifeSiteNews	2022-12-20 17:50:21 -08:00
Samuel Sloniker	9d82b07f17	Add RedState	2022-12-20 17:31:12 -08:00
Samuel Sloniker	f25594d771	Reduce model size	2022-11-28 18:00:31 -08:00
Samuel Sloniker	af9e5e92a3	Use 10-grams	2022-11-28 17:40:10 -08:00
Samuel Sloniker	b485780738	Fix sources list in README	2022-11-26 13:45:41 -08:00
Samuel Sloniker	82846f39ba	Add Democracy Now!	2022-11-26 13:45:34 -08:00
Samuel Sloniker	5f3a2977f1	Add Slate	2022-11-26 13:05:45 -08:00
Samuel Sloniker	314bdef1c5	Compiler settings	2022-11-26 13:05:37 -08:00
Samuel Sloniker	28f81c9a63	Change minimum use count to 5	2022-11-26 10:51:14 -08:00
Samuel Sloniker	06190f5101	Add Washington Post	2022-11-26 09:41:42 -08:00
Samuel Sloniker	68c8949005	Sort sources in config file	2022-11-25 17:04:17 -08:00
Samuel Sloniker	29d77b5393	Only get news from New York Post	2022-11-25 12:48:46 -08:00
Samuel Sloniker	9f3abd8641	Add license headers to code	2022-11-25 09:35:27 -08:00
Samuel Sloniker	17bb8a4f3f	Document scripts	2022-11-25 09:33:02 -08:00
Samuel Sloniker	f0b93cd2f6	Make scripts executable	2022-11-25 09:29:39 -08:00
Samuel Sloniker	aa8fa31195	Rewrite export script in Python; use build dir	2022-11-25 09:25:54 -08:00