Compare commits

..

No commits in common. "master" and "2022-11-25-A" have entirely different histories.

11 changed files with 88 additions and 380 deletions

1
.gitignore vendored
View File

@ -141,4 +141,3 @@ cython_debug/
*.db *.db
*.db-journal *.db-journal
*.gptc *.gptc
build/

View File

@ -3,19 +3,10 @@
A [GPTC](https://git.kj7rrv.com/kj7rrv/gptc) model to classify American news as A [GPTC](https://git.kj7rrv.com/kj7rrv/gptc) model to classify American news as
right- or left-leaning right- or left-leaning
## Scripts Inclusion of a site in this model is not an endorsement of the site.
No scripts take any arguments.
* `./download.py`: download new articles and add them to the database
* `./compile.py`: compile GPTC model
* `./export.py`: create `build/` directory with files for release
* `./stats.py`: print statistics on article and source counts
## Sources ## Sources
Inclusion of a site in this model is not an endorsement of the site.
### Left ### Left
* ABC News * ABC News
@ -23,26 +14,19 @@ Inclusion of a site in this model is not an endorsement of the site.
* CBS News * CBS News
* CNBC * CNBC
* CNN * CNN
* Democracy Now!
* HuffPost (formerly Huffington Post) * HuffPost (formerly Huffington Post)
* The Intercept
* Los Angeles Times * Los Angeles Times
* PBS NewsHour * PBS NewsHour
* Slate
* The Washington Post
### Right ### Right
* The American Conservative
* American Thinker * American Thinker
* Breitbart * Breitbart
* Daily Caller * Daily Caller
* Epoch Times * Epoch Times
* The Federalist * The Federalist
* Fox News * Fox News
* LifeSiteNews
* New York Post
* Not the Bee * Not the Bee
* New York Post
* One America News Network * One America News Network
* RedState
* Washington Examiner * Washington Examiner

View File

@ -1,70 +0,0 @@
import gptc
# Amendment analysis: report how strongly each U.S. constitutional amendment
# is associated with the "left" and "right" categories of the compiled GPTC
# model stored in model.gptc, and print the result as a text table.

# (ordinal abbreviation, spelled-out ordinal) for each of the 27 amendments.
# Both spellings are looked up, since either may appear in article text.
amendments = [
    ("1st", "First"),
    ("2nd", "Second"),
    ("3rd", "Third"),
    ("4th", "Fourth"),
    ("5th", "Fifth"),
    ("6th", "Sixth"),
    ("7th", "Seventh"),
    ("8th", "Eighth"),
    ("9th", "Ninth"),
    ("10th", "Tenth"),
    ("11th", "Eleventh"),
    ("12th", "Twelfth"),
    ("13th", "Thirteenth"),
    ("14th", "Fourteenth"),
    ("15th", "Fifteenth"),
    ("16th", "Sixteenth"),
    ("17th", "Seventeenth"),
    ("18th", "Eighteenth"),
    ("19th", "Nineteenth"),
    ("20th", "Twentieth"),
    ("21st", "Twenty-first"),
    ("22nd", "Twenty-second"),
    ("23rd", "Twenty-third"),
    ("24th", "Twenty-fourth"),
    ("25th", "Twenty-fifth"),
    ("26th", "Twenty-sixth"),
    ("27th", "Twenty-seventh"),
]

# Table geometry. 14 is the length of the longest name ("Twenty-seventh");
# 5 is the width of a percentage such as "12.3%".
NAME_WIDTH = 14
PERCENT_WIDTH = 5

with open("model.gptc", "rb") as f:
    model = gptc.deserialize(f)

# Spelled-out ordinal -> {category: score}. Amendments for which neither
# spelling yields any model data are left out of the report.
data = {}
for number, name in amendments:
    number_data = model.get(number + " Amendment")
    name_data = model.get(name + " Amendment")
    if number_data and name_data:
        # Both spellings are known: average the two scores per category.
        data[name] = {
            key: (number_data[key] + name_data[key]) / 2
            for key in number_data
        }
    elif number_data or name_data:
        # Only one spelling is known: use it unchanged.
        data[name] = number_data or name_data

# Ascending by "left" score.
classified_amendments = sorted(data.items(), key=lambda x: x[1]["left"])

# Header and separator are derived from the same widths as the data rows so
# the three cannot drift out of alignment.
header = (
    "| " + "Amendment".ljust(NAME_WIDTH)
    + " | " + "Left".ljust(PERCENT_WIDTH)
    + " | " + "Right".ljust(PERCENT_WIDTH) + " |"
)
separator = (
    "+" + "-" * (NAME_WIDTH + 2)
    + "+" + "-" * (PERCENT_WIDTH + 2)
    + "+" + "-" * (PERCENT_WIDTH + 2) + "+"
)

print("# Constitutional Amendment Analysis")
print()
print("""This is an analysis of which amendments to the U.S. Constitution are mentioned
more in right- or left-leaning American news sources. Data do not necessarily
correlate with support or opposition for the amendment among right- or
left-leaning Americans.""")
print()
print(header)
print(separator)
# The loop variable is deliberately not called `data`, so the aggregated
# mapping above is not overwritten while the rows are printed.
for amendment, scores in classified_amendments:
    percent_right = f"{scores['right']*100:>4.1f}%"
    percent_left = f"{scores['left']*100:>4.1f}%"
    amendment_padding = " " * (NAME_WIDTH - len(amendment))
    print(f"| {amendment}{amendment_padding} | {percent_left} | {percent_right} |")
print(separator)
print(header)

View File

@ -1,85 +0,0 @@
import gptc
# State analysis: report how strongly the name of each U.S. state is
# associated with the "left" and "right" categories of the compiled GPTC
# model stored in model.gptc, and print the result as a text table.

states = [
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming",
]

# Width of a formatted percentage such as "12.3%".
PERCENT_WIDTH = 5

with open("model.gptc", "rb") as f:
    model = gptc.deserialize(f)

# (state, {category: score}) pairs, ascending by "left" score.
classified_states = []
for state in states:
    scores = model.get(state)
    # NOTE(review): presumably model.get returns an empty/falsy result for
    # text the model has never seen -- confirm against gptc. Such states are
    # skipped instead of crashing the sort on a missing "left" key.
    if scores:
        classified_states.append((state, scores))
classified_states.sort(key=lambda x: x[1]["left"])

longest = max(len(state) for state in states)

# Header and separator are derived from the same widths as the data rows so
# the three cannot drift out of alignment.
header = (
    "| " + "State".ljust(longest)
    + " | " + "Left".ljust(PERCENT_WIDTH)
    + " | " + "Right".ljust(PERCENT_WIDTH) + " |"
)
separator = (
    "+" + "-" * (longest + 2)
    + "+" + "-" * (PERCENT_WIDTH + 2)
    + "+" + "-" * (PERCENT_WIDTH + 2) + "+"
)

print("# State Analysis")
print()
print("""This is an analysis of which states are mentioned more in right- or left-
leaning American news sources. Results do not necessarily correlate with the
political views of residents of the states; for example, the predominantly
liberal state of Oregon is mentioned more in right-leaning sources than in
left-leaning ones.""")
print()
print(header)
print(separator)
for state, data in classified_states:
    # Fixed-width formatting keeps single-digit values ("5.0%") right-aligned
    # with two-digit ones ("50.0%").
    percent_right = f"{data['right']*100:>4.1f}%"
    percent_left = f"{data['left']*100:>4.1f}%"
    state_padding = " " * (longest - len(state))
    print(f"| {state}{state_padding} | {percent_left} | {percent_right} |")
print(separator)
print(header)

30
compile.py Executable file → Normal file
View File

@ -1,28 +1,6 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
import sqlite3 import sqlite3
import tomli
import gptc import gptc
with open("compiler.toml", "rb") as f:
config = tomli.load(f)
con = sqlite3.connect("articles.db") con = sqlite3.connect("articles.db")
con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);") con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")
@ -32,11 +10,9 @@ raw_model = [
] ]
with open("model.gptc", "w+b") as f: with open("model.gptc", "w+b") as f:
gptc.compile( f.write(
raw_model, gptc.compile(raw_model, max_ngram_length=3, min_count=3).serialize()
max_ngram_length=config["max_ngram_length"], )
min_count=config["min_count"],
).serialize(f)
con.commit() con.commit()
con.close() con.close()

View File

@ -1,2 +0,0 @@
max_ngram_length=5
min_count=5

18
download.py Executable file → Normal file
View File

@ -1,21 +1,3 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
import feedparser import feedparser
import sqlite3 import sqlite3
import goose3 import goose3

View File

@ -1,39 +0,0 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
import os
import shutil
import sqlite3
# Build a fresh release directory: start from an empty build/, copy the
# release files into it, then blank the article text in the copied database.

# Remove any previous build; a missing directory is not an error.
try:
    shutil.rmtree("build")
except FileNotFoundError:
    pass
os.mkdir("build")

# Files shipped with a release, copied under the same name into build/.
for release_file in (
    "articles.db",
    "sources.toml",
    "compiler.toml",
    "model.gptc",
):
    shutil.copy(release_file, f"build/{release_file}")

# Only the copy under build/ is modified. Each statement is committed before
# the next one runs, so the UPDATE is committed before VACUUM executes.
con = sqlite3.Connection("build/articles.db")
for statement in (
    "UPDATE articles SET text = '***';",
    "VACUUM;",
):
    con.execute(statement)
    con.commit()
con.close()

8
export.sh Executable file
View File

@ -0,0 +1,8 @@
#!/bin/sh
# Temporarily blank the article text in articles.db so the database can be
# published, then restore the original database once the user confirms.
#
# The original is kept as old_articles.db while the redacted version is in
# place.

# Abort on the first failing command (and on unset variables). Without this,
# a failed backup copy would still be followed by the destructive steps below.
set -eu

cp articles.db old_articles.db

# '***' is a single-quoted SQL string literal; SQLite treats double-quoted
# text as an identifier first and only falls back to a string as a legacy
# quirk that can be disabled at build time.
sqlite3 articles.db "UPDATE articles SET text = '***';"
sqlite3 articles.db 'VACUUM;'

# printf instead of `echo -n`, whose behaviour is not portable across sh
# implementations.
printf 'Press enter when done...'
# `read` needs a variable name in POSIX sh (dash rejects a bare `read`).
# `|| true` keeps the restore below running even when stdin hits end-of-file.
read -r _ || true

# Put the original database back, replacing the redacted one in a single step.
mv -f old_articles.db articles.db

View File

@ -5,11 +5,41 @@
# ? Newsmax (read timeout errors) # ? Newsmax (read timeout errors)
# ? Bloomberg (CAPTCHA on RSS feed?) # ? Bloomberg (CAPTCHA on RSS feed?)
[abc_news] [cnn]
feed="https://abcnews.go.com/abcnews/usheadlines" feed="http://rss.cnn.com/rss/cnn_latest.rss"
category="left" category="left"
contains_articles=false contains_articles=false
name="ABC News" name="CNN"
[huffpost]
feed="https://chaski.huffpost.com/us/auto"
category="left"
contains_articles=false
name="HuffPost"
[cnbc]
feed="https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
category="left"
contains_articles=false
name="CNBC"
[pbs_newshour]
feed="https://www.pbs.org/newshour/feeds/rss/headlines"
category="left"
contains_articles=false
name="PBS NewsHour"
[latimes]
feed="https://www.latimes.com/local/rss2.0.xml"
category="left"
contains_articles=false
name="Los Angeles Times"
[cbs_news]
feed="https://www.cbsnews.com/latest/rss/main"
category="left"
contains_articles=false
name="CBS News"
[atlantic] [atlantic]
feed="https://www.theatlantic.com/feed/all/" feed="https://www.theatlantic.com/feed/all/"
@ -18,35 +48,11 @@ contains_articles=true
name="The Atlantic" name="The Atlantic"
sort="Atlantic" sort="Atlantic"
[cbs_news] [abc_news]
feed="https://www.cbsnews.com/latest/rss/main" feed="https://abcnews.go.com/abcnews/usheadlines"
category="left" category="left"
contains_articles=false contains_articles=false
name="CBS News" name="ABC News"
[cnbc]
feed="https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
category="left"
contains_articles=false
name="CNBC"
[cnn]
feed="http://rss.cnn.com/rss/cnn_latest.rss"
category="left"
contains_articles=false
name="CNN"
[democracy_now]
feed="https://www.democracynow.org/democracynow.rss"
category="left"
contains_articles=false
name="Democracy Now!"
[huffpost]
feed="https://chaski.huffpost.com/us/auto"
category="left"
contains_articles=false
name="HuffPost"
[intercept] [intercept]
feed="https://theintercept.com/feed/?lang=en" feed="https://theintercept.com/feed/?lang=en"
@ -55,42 +61,45 @@ contains_articles=true
name="The Intercept" name="The Intercept"
sort="Intercept" sort="Intercept"
[latimes]
feed="https://www.latimes.com/local/rss2.0.xml"
category="left"
contains_articles=false
name="Los Angeles Times"
[pbs_newshour]
feed="https://www.pbs.org/newshour/feeds/rss/headlines"
category="left"
contains_articles=false
name="PBS NewsHour"
[slate]
feed="http://www.slate.com/articles/news_and_politics.fulltext.all.10.rss"
category="left"
contains_articles=false
name="Slate"
[washington_post]
feed="https://feeds.washingtonpost.com/rss/national"
category="left"
contains_articles=false
name="The Washington Post"
sort="Washington Post"
#[bloomberg] #[bloomberg]
#feed="https://www.bloomberg.com/politics/feeds/site.xml" #feed="https://www.bloomberg.com/politics/feeds/site.xml"
#category="left" #category="left"
#contains_articles=false #contains_articles=false
[american_conservative] [fox_news]
feed="https://theamericanconservative.com/articles/feed/" feed="https://moxie.foxnews.com/google-publisher/latest.xml"
category="right" category="right"
contains_articles=true contains_articles=true
name="The American Conservative" name="Fox News"
sort="American Conservative"
[oann]
feed="https://www.oann.com/category/newsroom/feed"
category="right"
contains_articles=true
name="One America News Network"
[nypost]
feed="https://nypost.com/feed"
category="right"
contains_articles=false
exclude=[
{ type="startswith", pattern="https://pagesix.com" },
{ type="startswith", pattern="https://decider.com" },
]
name="New York Post"
[federalist]
feed="https://thefederalist.com/feed"
category="right"
contains_articles=false
name="The Federalist"
sort="Federalist"
[washington_examiner]
feed="https://feeds.feedburner.com/dcexaminer/Politics"
category="right"
contains_articles=true
name="Washington Examiner"
[american_thinker] [american_thinker]
feed="https://feeds.feedburner.com/americanthinker_articles" feed="https://feeds.feedburner.com/americanthinker_articles"
@ -104,66 +113,30 @@ category="right"
contains_articles=false contains_articles=false
name="Breitbart" name="Breitbart"
[daily_caller]
feed="https://feeds.feedburner.com/dailycaller"
category="right"
contains_articles=false
name="Daily Caller"
[epoch_times] [epoch_times]
feed="https://www.theepochtimes.com/feed/" feed="https://www.theepochtimes.com/feed/"
category="right" category="right"
contains_articles=false contains_articles=false
name="Epoch Times" name="Epoch Times"
[federalist]
feed="https://thefederalist.com/feed"
category="right"
contains_articles=false
name="The Federalist"
sort="Federalist"
[fox_news]
feed="https://moxie.foxnews.com/google-publisher/latest.xml"
category="right"
contains_articles=true
name="Fox News"
[lifesitenews]
feed="https://www.lifesitenews.com/ldn/rss/headlines.xml"
category="right"
contains_articles=false
name="LifeSiteNews"
[not_the_bee] [not_the_bee]
feed="https://notthebee.com/feed" feed="https://notthebee.com/feed"
category="right" category="right"
contains_articles=false contains_articles=false
name="Not the Bee" name="Not the Bee"
[nypost] [daily_caller]
feed="https://nypost.com/news/feed" feed="https://feeds.feedburner.com/dailycaller"
category="right" category="right"
contains_articles=false contains_articles=false
name="New York Post" name="Daily Caller"
[oann] [american_conservative]
feed="https://www.oann.com/category/newsroom/feed" feed="https://theamericanconservative.com/articles/feed/"
category="right" category="right"
contains_articles=true contains_articles=true
name="One America News Network" name="The American Conservative"
sort="American Conservative"
[redstate]
feed="https://redstate.com/feed"
category="right"
contains_articles=false
name="RedState"
[washington_examiner]
feed="https://feeds.feedburner.com/dcexaminer/Politics"
category="right"
contains_articles=true
name="Washington Examiner"
#[newsmax] #[newsmax]
#feed="https://www.newsmax.com/rss/Newsfront/16/" #feed="https://www.newsmax.com/rss/Newsfront/16/"

18
stats.py Executable file → Normal file
View File

@ -1,21 +1,3 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
import sqlite3 import sqlite3
import tomli import tomli