Add ability to exclude articles by URL

This commit is contained in:
Samuel Sloniker 2022-11-24 12:23:36 -08:00
parent 9562634a03
commit 7bf17b150e
Signed by: kj7rrv
GPG Key ID: 1BB4029E66285A62
2 changed files with 42 additions and 24 deletions

View File

@ -6,6 +6,14 @@ import tomli
import gptc
import bs4
def matches(string, checks):
    """Return True if *string* satisfies at least one rule in *checks*.

    Each rule is a mapping with a "type" key and a "pattern" key. The only
    rule type acted on is "startswith", which matches when *string* begins
    with the rule's pattern. Rules of any other type never match.

    An empty *checks* yields False. A rule without a "type" key (or a
    "startswith" rule without a "pattern" key) raises KeyError.
    """
    # "pattern" is only looked up once the type test has passed, so rules of
    # other types are not required to carry a "pattern" key.
    return any(
        check["type"] == "startswith" and string.startswith(check["pattern"])
        for check in checks
    )
with open("sources.toml", "rb") as f:
sources = tomli.load(f)
@ -38,12 +46,13 @@ try:
entries = [
entry
for entry in feedparser.parse(url)["entries"]
if not entry["link"] in known
if not entry["link"] in known and not matches(entry["link"], config.get("exclude", []))
]
print(f"Fetched feed. Found {len(entries)} new articles.")
if contains_articles:
for entry in entries:
try:
print(f"Saving {entry['link']}")
con.execute(
"INSERT INTO articles VALUES (?, ?, ?, ?);",
@ -56,8 +65,11 @@ try:
).text,
),
)
except KeyError:
print("Not enough information. Skipping.")
else:
for entry in entries:
try:
print(f"Downloading {entry['link']}...")
con.execute(
"INSERT INTO articles VALUES (?, ?, ?, ?);",
@ -69,6 +81,8 @@ try:
),
)
print(f"Done downloading.")
except KeyError:
print("Not enough information. Skipping.")
finally:
con.commit()

View File

@ -49,6 +49,10 @@ contains_articles=true
feed="https://nypost.com/feed"
category="right"
contains_articles=false
exclude=[
{ type="startswith", pattern="https://pagesix.com" },
{ type="startswith", pattern="https://decider.com" },
]
[federalist]
feed="https://thefederalist.com/feed"