Compare commits

..

No commits in common. "master" and "2022-11-25-A" have entirely different histories.

11 changed files with 88 additions and 380 deletions

1
.gitignore vendored
View File

@ -141,4 +141,3 @@ cython_debug/
*.db
*.db-journal
*.gptc
build/

View File

@ -3,19 +3,10 @@
A [GPTC](https://git.kj7rrv.com/kj7rrv/gptc) model to classify American news as
right- or left-leaning
## Scripts
No scripts take any arguments.
* `./download.py`: download new articles and add them to the database
* `./compile.py`: compile GPTC model
* `./export.py`: create `build/` directory with files for release
* `./stats.py`: print statistics on article and source counts
Inclusion of a site in this model is not an endorsement of the site.
## Sources
Inclusion of a site in this model is not an endorsement of the site.
### Left
* ABC News
@ -23,26 +14,19 @@ Inclusion of a site in this model is not an endorsement of the site.
* CBS News
* CNBC
* CNN
* Democracy Now!
* HuffPost (formerly Huffington Post)
* The Intercept
* Los Angeles Times
* PBS NewsHour
* Slate
* The Washington Post
### Right
* The American Conservative
* American Thinker
* Breitbart
* Daily Caller
* Epoch Times
* The Federalist
* Fox News
* LifeSiteNews
* New York Post
* Not the Bee
* New York Post
* One America News Network
* RedState
* Washington Examiner

View File

@ -1,70 +0,0 @@
import gptc
# Each entry pairs the ordinal abbreviation of an amendment with its
# spelled-out name; both spellings are looked up in the model below.
amendments = [
    ("1st", "First"),
    ("2nd", "Second"),
    ("3rd", "Third"),
    ("4th", "Fourth"),
    ("5th", "Fifth"),
    ("6th", "Sixth"),
    ("7th", "Seventh"),
    ("8th", "Eighth"),
    ("9th", "Ninth"),
    ("10th", "Tenth"),
    ("11th", "Eleventh"),
    ("12th", "Twelfth"),
    ("13th", "Thirteenth"),
    ("14th", "Fourteenth"),
    ("15th", "Fifteenth"),
    ("16th", "Sixteenth"),
    ("17th", "Seventeenth"),
    ("18th", "Eighteenth"),
    ("19th", "Nineteenth"),
    ("20th", "Twentieth"),
    ("21st", "Twenty-first"),
    ("22nd", "Twenty-second"),
    ("23rd", "Twenty-third"),
    ("24th", "Twenty-fourth"),
    ("25th", "Twenty-fifth"),
    ("26th", "Twenty-sixth"),
    ("27th", "Twenty-seventh"),
]

with open("model.gptc", "rb") as f:
    model = gptc.deserialize(f)

# Map each spelled-out amendment name to its per-category scores.
# An amendment is left out entirely when neither spelling yields a
# truthy lookup result.
data = {}
for ordinal, spelled in amendments:
    by_ordinal = model.get(ordinal + " Amendment")
    by_name = model.get(spelled + " Amendment")
    if by_ordinal and by_name:
        # Both spellings are present: take the per-category mean.
        data[spelled] = {
            category: (by_ordinal[category] + by_name[category]) / 2
            for category in by_ordinal
        }
    elif by_ordinal:
        data[spelled] = by_ordinal
    elif by_name:
        data[spelled] = by_name

# Ascending by the "left" score.
classified_amendments = sorted(
    data.items(), key=lambda entry: entry[1]["left"]
)

header_row = "| Amendment | Left | Right |"
rule_row = "+----------------+-------+-------+"

print("# Constitutional Amendment Analysis")
print()
print("""This is an analysis of which amendments to the U.S. Constitution are mentioned
more in right- or left-leaning American news sources. Data do not necessarily
correlate with support or opposition for the amendment among right- or
left-leaning Americans.""")
print()

# The header and rule are repeated, mirrored, below the table body.
print(header_row)
print(rule_row)
for title, scores in classified_amendments:
    left_cell = f"{scores['left']*100:>4.1f}%"
    right_cell = f"{scores['right']*100:>4.1f}%"
    # Names are left-justified to 14 characters, the length of the
    # longest name in the list ("Twenty-seventh").
    print(f"| {title.ljust(14)} | {left_cell} | {right_cell} |")
print(rule_row)
print(header_row)

View File

@ -1,85 +0,0 @@
import gptc
# Names that are looked up in the model, one table row per name.
states = [
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming",
]

with open("model.gptc", "rb") as f:
    model = gptc.deserialize(f)

# Collect (state, scores) pairs. A state whose lookup comes back falsy is
# skipped: sorting on scores["left"] would otherwise raise for it.
# NOTE(review): this assumes model.get() returns something falsy for a
# name the model does not contain -- confirm against the gptc API.
classified_states = []
for state in states:
    state_data = model.get(state)
    if state_data:
        classified_states.append((state, state_data))

# Ascending by the "left" score.
classified_states.sort(key=lambda x: x[1]["left"])

# Width of the name column: the longest state name.
longest = max(len(state) for state in states)

print("# State Analysis")
print()
print("""This is an analysis of which states are mentioned more in right- or left-
leaning American news sources. Results do not necessarily correlate with the
political views of residents of the states; for example, the predominantly
liberal state of Oregon is mentioned more in right-leaning sources than in
left-leaning ones.""")
print()
print("| State | Left | Right |")
print("+----------------+-------+-------+")
for state, data in classified_states:
    # Same rounding as before (one decimal place); the value is now
    # right-aligned to four characters so single-digit percentages keep
    # the columns lined up with the rule rows.
    percent_right = f"{round(data['right']*1000)/10:>4}%"
    percent_left = f"{round(data['left']*1000)/10:>4}%"
    state_padding = " "*(longest - len(state))
    print(f"| {state}{state_padding} | {percent_left} | {percent_right} |")
print("+----------------+-------+-------+")
print("| State | Left | Right |")

30
compile.py Executable file → Normal file
View File

@ -1,28 +1,6 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
import sqlite3
import tomli
import gptc
with open("compiler.toml", "rb") as f:
config = tomli.load(f)
con = sqlite3.connect("articles.db")
con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")
@ -32,11 +10,9 @@ raw_model = [
]
with open("model.gptc", "w+b") as f:
gptc.compile(
raw_model,
max_ngram_length=config["max_ngram_length"],
min_count=config["min_count"],
).serialize(f)
f.write(
gptc.compile(raw_model, max_ngram_length=3, min_count=3).serialize()
)
con.commit()
con.close()

View File

@ -1,2 +0,0 @@
max_ngram_length=5
min_count=5

18
download.py Executable file → Normal file
View File

@ -1,21 +1,3 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
import feedparser
import sqlite3
import goose3

View File

@ -1,39 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
import os
import shutil
import sqlite3
# Recreate build/ from scratch so nothing from an earlier export survives.
try:
    shutil.rmtree("build")
except FileNotFoundError:
    # No previous build directory; nothing to remove.
    pass
os.mkdir("build")

# Copy the files into build/ under their original names.
for filename in ("articles.db", "sources.toml", "compiler.toml", "model.gptc"):
    shutil.copy(filename, "build/" + filename)

# Overwrite the text of every article in the *copied* database, then
# VACUUM it. The source articles.db is not touched.
# NOTE(review): presumably this keeps article text out of the released
# database while preserving the other columns -- confirm intent.
con = sqlite3.connect("build/articles.db")
try:
    con.execute("UPDATE articles SET text = '***';")
    con.commit()
    con.execute("VACUUM;")
    con.commit()
finally:
    # Close the connection even if one of the statements above fails.
    con.close()

8
export.sh Executable file
View File

@ -0,0 +1,8 @@
#!/bin/sh
# Temporarily blank the text column of articles.db in place, wait for the
# user to press enter, then put the original database back.

# Stop at the first failing command. In particular, never run the
# destructive UPDATE unless the backup copy below succeeded.
set -e

cp articles.db old_articles.db

# From here on a backup exists: restore it however the script ends
# (normal completion, a failing command, or an interrupt).
restore() {
    mv -f old_articles.db articles.db
}
trap restore EXIT
# Turn these signals into a normal exit so the EXIT trap above runs.
trap 'exit 1' HUP INT TERM

# '***' is single-quoted so SQL treats it as a string literal; a
# double-quoted token is an identifier in standard SQL.
sqlite3 articles.db "UPDATE articles SET text = '***';"
sqlite3 articles.db 'VACUUM;'

# printf instead of `echo -n`, and read with an explicit variable name:
# both forms are specified for POSIX sh.
printf '%s' 'Press enter when done...'
read -r _reply

View File

@ -5,11 +5,41 @@
# ? Newsmax (read timeout errors)
# ? Bloomberg (CAPTCHA on RSS feed?)
[abc_news]
feed="https://abcnews.go.com/abcnews/usheadlines"
[cnn]
feed="http://rss.cnn.com/rss/cnn_latest.rss"
category="left"
contains_articles=false
name="ABC News"
name="CNN"
[huffpost]
feed="https://chaski.huffpost.com/us/auto"
category="left"
contains_articles=false
name="HuffPost"
[cnbc]
feed="https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
category="left"
contains_articles=false
name="CNBC"
[pbs_newshour]
feed="https://www.pbs.org/newshour/feeds/rss/headlines"
category="left"
contains_articles=false
name="PBS NewsHour"
[latimes]
feed="https://www.latimes.com/local/rss2.0.xml"
category="left"
contains_articles=false
name="Los Angeles Times"
[cbs_news]
feed="https://www.cbsnews.com/latest/rss/main"
category="left"
contains_articles=false
name="CBS News"
[atlantic]
feed="https://www.theatlantic.com/feed/all/"
@ -18,35 +48,11 @@ contains_articles=true
name="The Atlantic"
sort="Atlantic"
[cbs_news]
feed="https://www.cbsnews.com/latest/rss/main"
[abc_news]
feed="https://abcnews.go.com/abcnews/usheadlines"
category="left"
contains_articles=false
name="CBS News"
[cnbc]
feed="https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
category="left"
contains_articles=false
name="CNBC"
[cnn]
feed="http://rss.cnn.com/rss/cnn_latest.rss"
category="left"
contains_articles=false
name="CNN"
[democracy_now]
feed="https://www.democracynow.org/democracynow.rss"
category="left"
contains_articles=false
name="Democracy Now!"
[huffpost]
feed="https://chaski.huffpost.com/us/auto"
category="left"
contains_articles=false
name="HuffPost"
name="ABC News"
[intercept]
feed="https://theintercept.com/feed/?lang=en"
@ -55,42 +61,45 @@ contains_articles=true
name="The Intercept"
sort="Intercept"
[latimes]
feed="https://www.latimes.com/local/rss2.0.xml"
category="left"
contains_articles=false
name="Los Angeles Times"
[pbs_newshour]
feed="https://www.pbs.org/newshour/feeds/rss/headlines"
category="left"
contains_articles=false
name="PBS NewsHour"
[slate]
feed="http://www.slate.com/articles/news_and_politics.fulltext.all.10.rss"
category="left"
contains_articles=false
name="Slate"
[washington_post]
feed="https://feeds.washingtonpost.com/rss/national"
category="left"
contains_articles=false
name="The Washington Post"
sort="Washington Post"
#[bloomberg]
#feed="https://www.bloomberg.com/politics/feeds/site.xml"
#category="left"
#contains_articles=false
[american_conservative]
feed="https://theamericanconservative.com/articles/feed/"
[fox_news]
feed="https://moxie.foxnews.com/google-publisher/latest.xml"
category="right"
contains_articles=true
name="The American Conservative"
sort="American Conservative"
name="Fox News"
[oann]
feed="https://www.oann.com/category/newsroom/feed"
category="right"
contains_articles=true
name="One America News Network"
[nypost]
feed="https://nypost.com/feed"
category="right"
contains_articles=false
exclude=[
{ type="startswith", pattern="https://pagesix.com" },
{ type="startswith", pattern="https://decider.com" },
]
name="New York Post"
[federalist]
feed="https://thefederalist.com/feed"
category="right"
contains_articles=false
name="The Federalist"
sort="Federalist"
[washington_examiner]
feed="https://feeds.feedburner.com/dcexaminer/Politics"
category="right"
contains_articles=true
name="Washington Examiner"
[american_thinker]
feed="https://feeds.feedburner.com/americanthinker_articles"
@ -104,66 +113,30 @@ category="right"
contains_articles=false
name="Breitbart"
[daily_caller]
feed="https://feeds.feedburner.com/dailycaller"
category="right"
contains_articles=false
name="Daily Caller"
[epoch_times]
feed="https://www.theepochtimes.com/feed/"
category="right"
contains_articles=false
name="Epoch Times"
[federalist]
feed="https://thefederalist.com/feed"
category="right"
contains_articles=false
name="The Federalist"
sort="Federalist"
[fox_news]
feed="https://moxie.foxnews.com/google-publisher/latest.xml"
category="right"
contains_articles=true
name="Fox News"
[lifesitenews]
feed="https://www.lifesitenews.com/ldn/rss/headlines.xml"
category="right"
contains_articles=false
name="LifeSiteNews"
[not_the_bee]
feed="https://notthebee.com/feed"
category="right"
contains_articles=false
name="Not the Bee"
[nypost]
feed="https://nypost.com/news/feed"
[daily_caller]
feed="https://feeds.feedburner.com/dailycaller"
category="right"
contains_articles=false
name="New York Post"
name="Daily Caller"
[oann]
feed="https://www.oann.com/category/newsroom/feed"
[american_conservative]
feed="https://theamericanconservative.com/articles/feed/"
category="right"
contains_articles=true
name="One America News Network"
[redstate]
feed="https://redstate.com/feed"
category="right"
contains_articles=false
name="RedState"
[washington_examiner]
feed="https://feeds.feedburner.com/dcexaminer/Politics"
category="right"
contains_articles=true
name="Washington Examiner"
name="The American Conservative"
sort="American Conservative"
#[newsmax]
#feed="https://www.newsmax.com/rss/Newsfront/16/"

18
stats.py Executable file → Normal file
View File

@ -1,21 +1,3 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
import sqlite3
import tomli