Add ability to exclude articles by URL

Author: Samuel Sloniker
Date: 2022-11-24 12:23:36 -08:00
Parent: 9562634a03
Commit: 7bf17b150e
Signed by: kj7rrv (GPG Key ID: 1BB4029E66285A62)
2 changed files with 42 additions and 24 deletions


@@ -6,6 +6,14 @@ import tomli
 import gptc
 import bs4
 
+
+def matches(string, checks):
+    for check in checks:
+        if check["type"] == "startswith" and string.startswith(check["pattern"]):
+            return True
+    return False
+
+
 with open("sources.toml", "rb") as f:
     sources = tomli.load(f)
@@ -38,37 +46,43 @@ try:
         entries = [
             entry
             for entry in feedparser.parse(url)["entries"]
-            if not entry["link"] in known
+            if not entry["link"] in known and not matches(entry["link"], config.get("exclude", []))
         ]
 
         print(f"Fetched feed. Found {len(entries)} new articles.")
 
         if contains_articles:
             for entry in entries:
-                print(f"Saving {entry['link']}")
-                con.execute(
-                    "INSERT INTO articles VALUES (?, ?, ?, ?);",
-                    (
-                        name,
-                        category,
-                        entry["link"],
-                        bs4.BeautifulSoup(
-                            entry["content"][0]["value"], features="lxml"
-                        ).text,
-                    ),
-                )
+                try:
+                    print(f"Saving {entry['link']}")
+                    con.execute(
+                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
+                        (
+                            name,
+                            category,
+                            entry["link"],
+                            bs4.BeautifulSoup(
+                                entry["content"][0]["value"], features="lxml"
+                            ).text,
+                        ),
+                    )
+                except KeyError:
+                    print("Not enough information. Skipping.")
         else:
             for entry in entries:
-                print(f"Downloading {entry['link']}...")
-                con.execute(
-                    "INSERT INTO articles VALUES (?, ?, ?, ?);",
-                    (
-                        name,
-                        category,
-                        entry["link"],
-                        g.extract(entry["link"]).cleaned_text,
-                    ),
-                )
-                print(f"Done downloading.")
+                try:
+                    print(f"Downloading {entry['link']}...")
+                    con.execute(
+                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
+                        (
+                            name,
+                            category,
+                            entry["link"],
+                            g.extract(entry["link"]).cleaned_text,
+                        ),
+                    )
+                    print(f"Done downloading.")
+                except KeyError:
+                    print("Not enough information. Skipping.")
 finally:
     con.commit()
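
As a quick illustration of the new helper, here is a minimal sketch of matches() in action. The exclude list mirrors the rules added to sources.toml (tomli parses the inline tables into plain dicts); the article URLs are invented for the example:

# Illustrative only: exclude rules as tomli parses them from sources.toml.
exclude = [
    {"type": "startswith", "pattern": "https://pagesix.com"},
    {"type": "startswith", "pattern": "https://decider.com"},
]

matches("https://pagesix.com/2022/11/24/example/", exclude)  # True -> article skipped
matches("https://nypost.com/2022/11/24/example/", exclude)   # False -> article kept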


@@ -49,6 +49,10 @@ contains_articles=true
 feed="https://nypost.com/feed"
 category="right"
 contains_articles=false
+exclude=[
+    { type="startswith", pattern="https://pagesix.com" },
+    { type="startswith", pattern="https://decider.com" },
+]
 
 [federalist]
 feed="https://thefederalist.com/feed"