Add ability to exclude articles by URL

This commit is contained in:
Samuel Sloniker 2022-11-24 12:23:36 -08:00
parent 9562634a03
commit 7bf17b150e
Signed by: kj7rrv
GPG Key ID: 1BB4029E66285A62
2 changed files with 42 additions and 24 deletions

View File

@ -6,6 +6,14 @@ import tomli
import gptc import gptc
import bs4 import bs4
def matches(string, checks):
    """Return True if *string* satisfies at least one entry in *checks*.

    Each check is a mapping with a "type" key naming the comparison and a
    "pattern" key holding its operand. Only the "startswith" type is
    implemented; entries of any other type never match, and their "pattern"
    key is not read.

    Checks are evaluated in order and evaluation stops at the first match.
    An empty *checks* sequence yields False.
    """
    return any(
        check["type"] == "startswith" and string.startswith(check["pattern"])
        for check in checks
    )
with open("sources.toml", "rb") as f: with open("sources.toml", "rb") as f:
sources = tomli.load(f) sources = tomli.load(f)
@ -38,12 +46,13 @@ try:
entries = [ entries = [
entry entry
for entry in feedparser.parse(url)["entries"] for entry in feedparser.parse(url)["entries"]
if not entry["link"] in known if not entry["link"] in known and not matches(entry["link"], config.get("exclude", []))
] ]
print(f"Fetched feed. Found {len(entries)} new articles.") print(f"Fetched feed. Found {len(entries)} new articles.")
if contains_articles: if contains_articles:
for entry in entries: for entry in entries:
try:
print(f"Saving {entry['link']}") print(f"Saving {entry['link']}")
con.execute( con.execute(
"INSERT INTO articles VALUES (?, ?, ?, ?);", "INSERT INTO articles VALUES (?, ?, ?, ?);",
@ -56,8 +65,11 @@ try:
).text, ).text,
), ),
) )
except KeyError:
print("Not enough information. Skipping.")
else: else:
for entry in entries: for entry in entries:
try:
print(f"Downloading {entry['link']}...") print(f"Downloading {entry['link']}...")
con.execute( con.execute(
"INSERT INTO articles VALUES (?, ?, ?, ?);", "INSERT INTO articles VALUES (?, ?, ?, ?);",
@ -69,6 +81,8 @@ try:
), ),
) )
print(f"Done downloading.") print(f"Done downloading.")
except KeyError:
print("Not enough information. Skipping.")
finally: finally:
con.commit() con.commit()

View File

@ -49,6 +49,10 @@ contains_articles=true
feed="https://nypost.com/feed" feed="https://nypost.com/feed"
category="right" category="right"
contains_articles=false contains_articles=false
exclude=[
{ type="startswith", pattern="https://pagesix.com" },
{ type="startswith", pattern="https://decider.com" },
]
[federalist] [federalist]
feed="https://thefederalist.com/feed" feed="https://thefederalist.com/feed"