Add ability to exclude articles by URL
parent 9562634a03
commit 7bf17b150e

download.py  | 62
sources.toml |  4
download.py
@@ -6,6 +6,14 @@ import tomli
 import gptc
 import bs4
 
+
+def matches(string, checks):
+    for check in checks:
+        if check["type"] == "startswith" and string.startswith(check["pattern"]):
+            return True
+    return False
+
+
 with open("sources.toml", "rb") as f:
     sources = tomli.load(f)
 
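Note: the new matches() helper dispatches on each check's "type" field; "startswith" is the only check type implemented so far, and unknown types are silently ignored (the loop falls through to return False). A minimal usage sketch with the helper above in scope; the URLs are hypothetical, not from the commit:

    checks = [{"type": "startswith", "pattern": "https://pagesix.com"}]
    print(matches("https://pagesix.com/2022/05/some-story/", checks))  # True
    print(matches("https://nypost.com/2022/05/some-story/", checks))   # False
    print(matches("https://nypost.com/2022/05/some-story/", []))       # False: no checks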
@@ -38,37 +46,43 @@ try:
         entries = [
             entry
             for entry in feedparser.parse(url)["entries"]
-            if not entry["link"] in known
+            if not entry["link"] in known and not matches(entry["link"], config.get("exclude", []))
         ]
         print(f"Fetched feed. Found {len(entries)} new articles.")
 
         if contains_articles:
             for entry in entries:
-                print(f"Saving {entry['link']}")
-                con.execute(
-                    "INSERT INTO articles VALUES (?, ?, ?, ?);",
-                    (
-                        name,
-                        category,
-                        entry["link"],
-                        bs4.BeautifulSoup(
-                            entry["content"][0]["value"], features="lxml"
-                        ).text,
-                    ),
-                )
+                try:
+                    print(f"Saving {entry['link']}")
+                    con.execute(
+                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
+                        (
+                            name,
+                            category,
+                            entry["link"],
+                            bs4.BeautifulSoup(
+                                entry["content"][0]["value"], features="lxml"
+                            ).text,
+                        ),
+                    )
+                except KeyError:
+                    print("Not enough information. Skipping.")
         else:
             for entry in entries:
-                print(f"Downloading {entry['link']}...")
-                con.execute(
-                    "INSERT INTO articles VALUES (?, ?, ?, ?);",
-                    (
-                        name,
-                        category,
-                        entry["link"],
-                        g.extract(entry["link"]).cleaned_text,
-                    ),
-                )
-                print(f"Done downloading.")
+                try:
+                    print(f"Downloading {entry['link']}...")
+                    con.execute(
+                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
+                        (
+                            name,
+                            category,
+                            entry["link"],
+                            g.extract(entry["link"]).cleaned_text,
+                        ),
+                    )
+                    print(f"Done downloading.")
+                except KeyError:
+                    print("Not enough information. Skipping.")
 finally:
     con.commit()
 
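Note: this hunk makes two changes. The list comprehension now also drops entries whose link matches a configured exclude check, with config.get("exclude", []) keeping the key optional per source, and each insert is wrapped in try/except KeyError so one incomplete entry no longer aborts the whole feed. A minimal sketch of one plausible failure mode the handler catches; the entry below is hypothetical, not from the commit:

    entry = {"link": "https://example.com/post"}    # no "content" key
    try:
        entry["content"][0]["value"]                # same access as the diff above
    except KeyError:
        print("Not enough information. Skipping.")  # mirrors the new handler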
sources.toml
@@ -49,6 +49,10 @@ contains_articles=true
 feed="https://nypost.com/feed"
 category="right"
 contains_articles=false
+exclude=[
+    { type="startswith", pattern="https://pagesix.com" },
+    { type="startswith", pattern="https://decider.com" },
+]
 
 [federalist]
 feed="https://thefederalist.com/feed"
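Note: each item in the new exclude array is an inline TOML table that tomli loads as a plain dict, which is exactly the shape matches() expects. With this config, NY Post feed entries linking into the Page Six or Decider subsites are skipped. A short sketch of the resulting behavior, assuming matches() from download.py is in scope; the URLs below are hypothetical:

    exclude = [
        {"type": "startswith", "pattern": "https://pagesix.com"},
        {"type": "startswith", "pattern": "https://decider.com"},
    ]
    print(matches("https://decider.com/2022/05/show-review/", exclude))  # True: skipped
    print(matches("https://nypost.com/2022/05/news-story/", exclude))    # False: kept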