Compare commits

...

32 Commits

Author SHA1 Message Date
9c66b18cfe
Export compiler.toml 2023-01-08 08:48:20 -08:00
fe088822e1
Decrease ngram length to 5 2023-01-08 08:47:59 -08:00
1daab919ea
Add analyses 2023-01-08 08:45:22 -08:00
4eeb8d2d17
Use GPTC v4.0.0 2022-12-24 12:29:00 -08:00
f1ccaaabab
Add LifeSiteNews 2022-12-20 17:50:21 -08:00
9d82b07f17
Add RedState 2022-12-20 17:31:12 -08:00
f25594d771
Reduce model size 2022-11-28 18:00:31 -08:00
af9e5e92a3
Use 10-grams 2022-11-28 17:40:10 -08:00
b485780738
Fix sources list in README 2022-11-26 13:45:41 -08:00
82846f39ba
Add Democracy Now! 2022-11-26 13:45:34 -08:00
5f3a2977f1
Add Slate 2022-11-26 13:05:45 -08:00
314bdef1c5
Compiler settings 2022-11-26 13:05:37 -08:00
28f81c9a63
Change minimum use count to 5 2022-11-26 10:51:14 -08:00
06190f5101
Add Washington Post 2022-11-26 09:41:42 -08:00
68c8949005
Sort sources in config file 2022-11-25 17:04:17 -08:00
29d77b5393
Only get news from New York Post 2022-11-25 12:48:46 -08:00
9f3abd8641
Add license headers to code 2022-11-25 09:35:27 -08:00
17bb8a4f3f
Document scripts 2022-11-25 09:33:02 -08:00
f0b93cd2f6
Make scripts executable 2022-11-25 09:29:39 -08:00
aa8fa31195
Rewrite export script in Python; use build dir 2022-11-25 09:25:54 -08:00
e5e046b70d
Add sources 2022-11-25 09:12:45 -08:00
54d97a3a16
Remove extra newline from stats 2022-11-24 21:31:36 -08:00
487b087910
Rewrite stats script in Python 2022-11-24 21:27:00 -08:00
43faa6139a
Format download.py 2022-11-24 20:57:14 -08:00
5f7fd0ccb5
Add names and sort keys 2022-11-24 20:56:25 -08:00
af5f3c3df1
Split download into download and compile 2022-11-24 20:46:05 -08:00
a96d474e37
Update gitignore 2022-11-24 20:23:43 -08:00
ee8189d476
Update stats script 2022-11-24 19:48:54 -08:00
1d2cfab68c
More sources 2022-11-24 19:41:40 -08:00
c5cc6d78f9
More sources 2022-11-24 15:57:56 -08:00
e206210ec5
Disclaimer 2022-11-24 13:45:28 -08:00
7bf17b150e
Add ability to exclude articles by URL 2022-11-24 12:23:36 -08:00
12 changed files with 533 additions and 89 deletions

2
.gitignore vendored
View File

@@ -139,4 +139,6 @@ cython_debug/
# Model
*.db
*.db-journal
*.gptc
build/

View File

@@ -3,19 +3,46 @@
A [GPTC](https://git.kj7rrv.com/kj7rrv/gptc) model to classify American news as
right- or left-leaning
## Scripts
No scripts take any arguments.
* `./download.py`: download new articles and add them to the database
* `./compile.py`: compile GPTC model
* `./export.py`: create `build/` directory with files for release
* `./stats.py`: print statistics on article and source counts
## Sources
Inclusion of a site in this model is not an endorsement of the site.
### Left
* CNN
* HuffPost (formerly Huffington Post)
* ABC News
* The Atlantic
* CBS News
* CNBC
* PBS NewsHour
* CNN
* Democracy Now!
* HuffPost (formerly Huffington Post)
* The Intercept
* Los Angeles Times
* PBS NewsHour
* Slate
* The Washington Post
### Right
* Fox News
* One America News Network
* New York Post
* The American Conservative
* American Thinker
* Breitbart
* Daily Caller
* Epoch Times
* The Federalist
* Fox News
* LifeSiteNews
* New York Post
* Not the Bee
* One America News Network
* RedState
* Washington Examiner

View File

@@ -0,0 +1,70 @@
"""Analyze whether U.S. constitutional amendments are mentioned more in
right- or left-leaning American news sources, using the compiled GPTC model.

Prints a Markdown document with an aligned plain-text table, sorted from
least to most left-leaning.
"""

import gptc

# (ordinal, spelled-out) forms of all 27 amendments. News text uses both
# spellings (e.g. "1st Amendment" and "First Amendment"), so the model's
# entries for the two forms are merged below.
amendments = [
    ("1st", "First"),
    ("2nd", "Second"),
    ("3rd", "Third"),
    ("4th", "Fourth"),
    ("5th", "Fifth"),
    ("6th", "Sixth"),
    ("7th", "Seventh"),
    ("8th", "Eighth"),
    ("9th", "Ninth"),
    ("10th", "Tenth"),
    ("11th", "Eleventh"),
    ("12th", "Twelfth"),
    ("13th", "Thirteenth"),
    ("14th", "Fourteenth"),
    ("15th", "Fifteenth"),
    ("16th", "Sixteenth"),
    ("17th", "Seventeenth"),
    ("18th", "Eighteenth"),
    ("19th", "Nineteenth"),
    ("20th", "Twentieth"),
    ("21st", "Twenty-first"),
    ("22nd", "Twenty-second"),
    ("23rd", "Twenty-third"),
    ("24th", "Twenty-fourth"),
    ("25th", "Twenty-fifth"),
    ("26th", "Twenty-sixth"),
    ("27th", "Twenty-seventh"),
]

# Load the compiled model produced by compile.py.
with open("model.gptc", "rb") as f:
    model = gptc.deserialize(f)

data = {}
for number, name in amendments:
    number_data = model.get(number + " Amendment")
    name_data = model.get(name + " Amendment")
    if number_data and not name_data:
        data[name] = number_data
    elif name_data and not number_data:
        data[name] = name_data
    elif number_data and name_data:
        # Both spellings occur in the model: average their per-category
        # values key-by-key.
        data[name] = {
            key: (number_data[key] + name_data[key]) / 2
            for key in number_data.keys()
        }
    # Amendments absent from the model under both spellings are omitted.

# Sort ascending by "left" score: least left-leaning first.
classified_amendments = sorted(data.items(), key=lambda x: x[1]["left"])

# Derive the name-column width from the longest amendment name
# ("Twenty-seventh", 14 characters) instead of hard-coding it in four
# places; matches the approach in analyses/states.py.
width = max(len(name) for _, name in amendments)
separator = f"+{'-' * (width + 2)}+-------+-------+"
header = f"| {'Amendment'.ljust(width)} | Left  | Right |"

print("# Constitutional Amendment Analysis")
print()
print("""This is an analysis of which amendments to the U.S. Constitution are mentioned
more in right- or left-leaning American news sources. Data do not necessarily
correlate with support or opposition for the amendment among right- or
left-leaning Americans.""")
print()
print(header)
print(separator)
for amendment, scores in classified_amendments:
    # ">4.1f" gives fixed-width percentages so the columns stay aligned.
    percent_right = f"{scores['right'] * 100:>4.1f}%"
    percent_left = f"{scores['left'] * 100:>4.1f}%"
    print(f"| {amendment.ljust(width)} | {percent_left} | {percent_right} |")
print(separator)
print(header)

85
analyses/states.py Normal file
View File

@@ -0,0 +1,85 @@
"""Analyze whether U.S. states are mentioned more in right- or left-leaning
American news sources, using the compiled GPTC model.

Prints a Markdown document with an aligned plain-text table, sorted from
least to most left-leaning.
"""

import gptc

# All 50 U.S. states.
states = [
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming",
]

# Load the compiled model produced by compile.py.
with open("model.gptc", "rb") as f:
    model = gptc.deserialize(f)

# Pair each state with its per-category values. States absent from the
# model are skipped instead of crashing the sort on a None value (the
# amendments analysis omits missing entries the same way).
classified_states = [
    (state, scores) for state in states if (scores := model.get(state))
]
# Sort ascending by "left" score: least left-leaning first.
classified_states.sort(key=lambda x: x[1]["left"])

longest = max(len(state) for state in states)
separator = f"+{'-' * (longest + 2)}+-------+-------+"
header = f"| {'State'.ljust(longest)} | Left  | Right |"

print("# State Analysis")
print()
print("""This is an analysis of which states are mentioned more in right- or left-
leaning American news sources. Results do not necessarily correlate with the
political views of residents of the states; for example, the predominantly
liberal state of Oregon is mentioned more in right-leaning sources than in
left-leaning ones.""")
print()
print(header)
print(separator)
for state, data in classified_states:
    # Fixed-width ">4.1f" percentages keep the table columns aligned; the
    # old round(x * 1000) / 10 formatting produced variable-width strings
    # ("5.3%" vs "12.3%") that broke the alignment.
    percent_right = f"{data['right'] * 100:>4.1f}%"
    percent_left = f"{data['left'] * 100:>4.1f}%"
    print(f"| {state.ljust(longest)} | {percent_left} | {percent_right} |")
print(separator)
print(header)

42
compile.py Executable file
View File

@@ -0,0 +1,42 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
"""Compile the downloaded articles into a GPTC model (model.gptc)."""

import sqlite3

import tomli
import gptc

# Compiler parameters (max_ngram_length, min_count) live in compiler.toml.
with open("compiler.toml", "rb") as config_file:
    config = tomli.load(config_file)

connection = sqlite3.connect("articles.db")
# Ensure the table exists on a fresh checkout so the SELECT below cannot fail.
connection.execute(
    "CREATE TABLE IF NOT EXISTS articles(source, category, url, text);"
)

# GPTC expects an iterable of {"text": ..., "category": ...} mappings.
training_data = [
    {"text": text, "category": category}
    for text, category in connection.execute(
        "SELECT text, category FROM articles;"
    )
]

with open("model.gptc", "w+b") as model_file:
    compiled_model = gptc.compile(
        training_data,
        max_ngram_length=config["max_ngram_length"],
        min_count=config["min_count"],
    )
    compiled_model.serialize(model_file)

connection.commit()
connection.close()

2
compiler.toml Normal file
View File

@@ -0,0 +1,2 @@
# GPTC compiler settings, read by compile.py.

# Longest n-gram (in tokens) to include in the compiled model.
max_ngram_length=5
# Minimum number of occurrences an n-gram needs to be kept in the model.
min_count=5

96
download.py Normal file → Executable file
View File

@@ -1,11 +1,37 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
import feedparser
import hashlib
import sqlite3
import goose3
import tomli
import gptc
import bs4
def matches(string, checks):
    """Return True if `string` matches any of the exclusion `checks`.

    Each check is a mapping with a "type" key and a "pattern" key. Only
    the "startswith" type is supported: it matches when `string` begins
    with the check's "pattern". Any other type never matches, and an
    empty `checks` list matches nothing.
    """
    return any(
        check["type"] == "startswith"
        and string.startswith(check["pattern"])
        for check in checks
    )
with open("sources.toml", "rb") as f:
sources = tomli.load(f)
@@ -39,49 +65,43 @@ try:
entry
for entry in feedparser.parse(url)["entries"]
if not entry["link"] in known
and not matches(entry["link"], config.get("exclude", []))
]
print(f"Fetched feed. Found {len(entries)} new articles.")
if contains_articles:
for entry in entries:
print(f"Saving {entry['link']}")
con.execute(
"INSERT INTO articles VALUES (?, ?, ?, ?);",
(
name,
category,
entry["link"],
bs4.BeautifulSoup(
entry["content"][0]["value"], features="lxml"
).text,
),
)
try:
print(f"Saving {entry['link']}")
con.execute(
"INSERT INTO articles VALUES (?, ?, ?, ?);",
(
name,
category,
entry["link"],
bs4.BeautifulSoup(
entry["content"][0]["value"], features="lxml"
).text,
),
)
except KeyError:
print("Not enough information. Skipping.")
else:
for entry in entries:
print(f"Downloading {entry['link']}...")
con.execute(
"INSERT INTO articles VALUES (?, ?, ?, ?);",
(
name,
category,
entry["link"],
g.extract(entry["link"]).cleaned_text,
),
)
print(f"Done downloading.")
try:
print(f"Downloading {entry['link']}...")
con.execute(
"INSERT INTO articles VALUES (?, ?, ?, ?);",
(
name,
category,
entry["link"],
g.extract(entry["link"]).cleaned_text,
),
)
print(f"Done downloading.")
except KeyError:
print("Not enough information. Skipping.")
finally:
con.commit()
print("Compiling model...")
raw_model = [
{"text": i[0], "category": i[1]}
for i in con.execute("SELECT text, category FROM articles;")
]
with open("model.gptc", "w+b") as f:
f.write(
gptc.compile(raw_model, max_ngram_length=3, min_count=3).serialize()
)
con.close()

39
export.py Executable file
View File

@@ -0,0 +1,39 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
"""Create a build/ directory containing the files for a release.

The article database is copied into build/ and every article's text is
replaced with "***" so the release does not redistribute the full article
bodies.
"""

import os
import shutil
import sqlite3

# Start from a clean build directory; a missing one is fine.
try:
    shutil.rmtree("build")
except FileNotFoundError:
    pass
os.mkdir("build")

shutil.copy("articles.db", "build/articles.db")
shutil.copy("sources.toml", "build/sources.toml")
shutil.copy("compiler.toml", "build/compiler.toml")
shutil.copy("model.gptc", "build/model.gptc")

# Redact article text in the exported copy only. Use the documented
# sqlite3.connect() factory rather than instantiating sqlite3.Connection
# directly. VACUUM (committed separately) rewrites the file so the freed
# space is actually reclaimed.
con = sqlite3.connect("build/articles.db")
con.execute("UPDATE articles SET text = '***';")
con.commit()
con.execute("VACUUM;")
con.commit()
con.close()

View File

@@ -1,8 +0,0 @@
#!/bin/sh
# Temporarily redact article text from articles.db, wait for the user to do
# something with the redacted file, then restore the original database.
# Keep a pristine copy so the database can be restored afterwards.
cp articles.db old_articles.db
# Replace every article body with a placeholder in the working copy.
sqlite3 articles.db 'UPDATE articles SET text = "***";'
# Rewrite the file so the space freed by the update is reclaimed.
sqlite3 articles.db 'VACUUM;'
# Pause while the user works with the redacted articles.db.
echo -n "Press enter when done..."
read
# Swap the pristine copy back into place.
rm articles.db
mv old_articles.db articles.db

View File

@@ -5,55 +5,165 @@
# ? Newsmax (read timeout errors)
# ? Bloomberg (CAPTCHA on RSS feed?)
[cnn]
feed="http://rss.cnn.com/rss/cnn_latest.rss"
[abc_news]
feed="https://abcnews.go.com/abcnews/usheadlines"
category="left"
contains_articles=false
name="ABC News"
[huffpost]
feed="https://chaski.huffpost.com/us/auto"
[atlantic]
feed="https://www.theatlantic.com/feed/all/"
category="left"
contains_articles=true
name="The Atlantic"
sort="Atlantic"
[cbs_news]
feed="https://www.cbsnews.com/latest/rss/main"
category="left"
contains_articles=false
name="CBS News"
[cnbc]
feed="https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100003114"
category="left"
contains_articles=false
name="CNBC"
[pbs_newshour]
feed="https://www.pbs.org/newshour/feeds/rss/headlines"
[cnn]
feed="http://rss.cnn.com/rss/cnn_latest.rss"
category="left"
contains_articles=false
name="CNN"
[democracy_now]
feed="https://www.democracynow.org/democracynow.rss"
category="left"
contains_articles=false
name="Democracy Now!"
[huffpost]
feed="https://chaski.huffpost.com/us/auto"
category="left"
contains_articles=false
name="HuffPost"
[intercept]
feed="https://theintercept.com/feed/?lang=en"
category="left"
contains_articles=true
name="The Intercept"
sort="Intercept"
[latimes]
feed="https://www.latimes.com/local/rss2.0.xml"
category="left"
contains_articles=false
name="Los Angeles Times"
[pbs_newshour]
feed="https://www.pbs.org/newshour/feeds/rss/headlines"
category="left"
contains_articles=false
name="PBS NewsHour"
[slate]
feed="http://www.slate.com/articles/news_and_politics.fulltext.all.10.rss"
category="left"
contains_articles=false
name="Slate"
[washington_post]
feed="https://feeds.washingtonpost.com/rss/national"
category="left"
contains_articles=false
name="The Washington Post"
sort="Washington Post"
#[bloomberg]
#feed="https://www.bloomberg.com/politics/feeds/site.xml"
#category="left"
#contains_articles=false
[fox]
feed="https://moxie.foxnews.com/google-publisher/latest.xml"
[american_conservative]
feed="https://theamericanconservative.com/articles/feed/"
category="right"
contains_articles=true
name="The American Conservative"
sort="American Conservative"
[oann]
feed="https://www.oann.com/category/newsroom/feed"
category="right"
contains_articles=true
[nypost]
feed="https://nypost.com/feed"
[american_thinker]
feed="https://feeds.feedburner.com/americanthinker_articles"
category="right"
contains_articles=false
name="American Thinker"
[breitbart]
feed="https://feeds.feedburner.com/breitbart/"
category="right"
contains_articles=false
name="Breitbart"
[daily_caller]
feed="https://feeds.feedburner.com/dailycaller"
category="right"
contains_articles=false
name="Daily Caller"
[epoch_times]
feed="https://www.theepochtimes.com/feed/"
category="right"
contains_articles=false
name="Epoch Times"
[federalist]
feed="https://thefederalist.com/feed"
category="right"
contains_articles=false
name="The Federalist"
sort="Federalist"
[fox_news]
feed="https://moxie.foxnews.com/google-publisher/latest.xml"
category="right"
contains_articles=true
name="Fox News"
[lifesitenews]
feed="https://www.lifesitenews.com/ldn/rss/headlines.xml"
category="right"
contains_articles=false
name="LifeSiteNews"
[not_the_bee]
feed="https://notthebee.com/feed"
category="right"
contains_articles=false
name="Not the Bee"
[nypost]
feed="https://nypost.com/news/feed"
category="right"
contains_articles=false
name="New York Post"
[oann]
feed="https://www.oann.com/category/newsroom/feed"
category="right"
contains_articles=true
name="One America News Network"
[redstate]
feed="https://redstate.com/feed"
category="right"
contains_articles=false
name="RedState"
[washington_examiner]
feed="https://feeds.feedburner.com/dcexaminer/Politics"
category="right"
contains_articles=true
name="Washington Examiner"
#[newsmax]
#feed="https://www.newsmax.com/rss/Newsfront/16/"

77
stats.py Executable file
View File

@@ -0,0 +1,77 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
"""Print Markdown statistics on article and source counts."""

import sqlite3

import tomli

with open("sources.toml", "rb") as f:
    sources = tomli.load(f)

con = sqlite3.connect("articles.db")
# Ensure the table exists on a fresh checkout so the queries below cannot fail.
con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")


def _count(where="", params=()):
    """Return the number of rows in `articles` matching the WHERE clause.

    Uses COUNT(*) so SQLite counts the rows itself instead of this script
    fetching every URL into a list just to take its length.
    """
    query = "SELECT COUNT(*) FROM articles"
    if where:
        query += f" WHERE {where}"
    return con.execute(query, params).fetchone()[0]


article_count = _count()
left_article_count = _count("category = 'left'")
right_article_count = _count("category = 'right'")

left_sources = []
right_sources = []
for source_id, source_info in sources.items():
    entry = {
        "name": source_info["name"],
        # "sort" overrides the alphabetization key (e.g. drops a leading
        # "The"); fall back to the display name.
        "sort": source_info.get("sort", source_info["name"]),
        "count": _count("source = ?", (source_id,)),
    }
    # Any source not explicitly categorized "left" is treated as right.
    if source_info["category"] == "left":
        left_sources.append(entry)
    else:
        right_sources.append(entry)

source_count = len(left_sources) + len(right_sources)
left_source_count = len(left_sources)
right_source_count = len(right_sources)

left_sources.sort(key=lambda source: source["sort"])
right_sources.sort(key=lambda source: source["sort"])

left_breakdown = "\n".join(
    f"* {source['name']}: {source['count']}" for source in left_sources
)
right_breakdown = "\n".join(
    f"* {source['name']}: {source['count']}" for source in right_sources
)

con.commit()
con.close()

print(f"""\
This model contains a total of {article_count} articles from {source_count} sources.
## Left
{left_breakdown}
Left total: {left_article_count} articles from {left_source_count} sources
## Right
{right_breakdown}
Right total: {right_article_count} articles from {right_source_count} sources""")

View File

@@ -1,22 +0,0 @@
#!/bin/bash
# Print article-count statistics from articles.db as a Markdown summary.
# Total article counts, overall and per category.
total=$(sqlite3 articles.db "SELECT url FROM articles" | wc -l)
left=$(sqlite3 articles.db "SELECT url FROM articles WHERE category = 'left'" | wc -l)
right=$(sqlite3 articles.db "SELECT url FROM articles WHERE category = 'right'" | wc -l)
# Distinct source identifiers per category, one per line.
# NOTE(review): the unquoted $left_sources/$right_sources expansions below
# rely on word splitting, so source IDs must contain no whitespace — confirm.
left_sources=$(sqlite3 articles.db "SELECT source FROM articles WHERE category = 'left'" | sort | uniq)
right_sources=$(sqlite3 articles.db "SELECT source FROM articles WHERE category = 'right'" | sort | uniq)
echo "This model contains a total of $total articles ($left left, $right right)."
echo ""
echo "## Left"
echo ""
# Per-source article counts.
# NOTE(review): $i is interpolated into the SQL string unescaped; this is
# only safe because the IDs come from this same database.
for i in $left_sources; do
echo "* $i: $(sqlite3 articles.db "SELECT url FROM articles WHERE source = '$i'" | wc -l)"
done
echo ""
echo "## Right"
echo ""
for i in $right_sources; do
echo "* $i: $(sqlite3 articles.db "SELECT url FROM articles WHERE source = '$i'" | wc -l)"
done