import feedparser
import sqlite3
import goose3
import tomli
import bs4
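

# Return True if `string` matches any of the given checks; only the
# "startswith" check type is supported.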
def matches(string, checks):
    for check in checks:
        if check["type"] == "startswith" and string.startswith(check["pattern"]):
            return True
    return False
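

# sources.toml maps each source name to its feed settings. Judging from the
# keys read below, an entry presumably looks like this (hypothetical values):
#
# [example-blog]
# feed = "https://example.com/feed.xml"
# category = "tech"
# contains_articles = true
# exclude = [{ type = "startswith", pattern = "https://example.com/ads/" }]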
with open("sources.toml", "rb") as f:
    sources = tomli.load(f)
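
# goose3 fetches a page and extracts the main article text from it.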
g = goose3.Goose()
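
# One row per article; `text` holds the extracted plain text.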
con = sqlite3.connect("articles.db")
con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")
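
# URLs already stored; matching feed entries are skipped below.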
known = [i[0] for i in con.execute("SELECT url FROM articles;")]
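
# Commit and close the database even if a fetch fails partway through.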
try:
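    # Re-download any rows whose text is still the '***' placeholder.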
    for (url,) in con.execute("SELECT url FROM articles WHERE (text = '***');"):
        print(f"Downloading {url}...")
        con.execute(
            "UPDATE articles SET text = ? WHERE (url = ?);",
            (
                g.extract(url).cleaned_text,
                url,
            ),
        )
        print("Done downloading.")
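
    # Scan every configured feed for entries we have not stored yet.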
    for name, config in sources.items():
        url = config["feed"]
        category = config["category"]
        contains_articles = config["contains_articles"]
print(f"Fetching {name} feed ({url})...")
|
|
|
|
entries = [
|
|
|
|
entry
|
|
|
|
for entry in feedparser.parse(url)["entries"]
|
2022-11-24 12:23:36 -08:00
|
|
|
if not entry["link"] in known and not matches(entry["link"], config.get("exclude", []))
|
2022-11-24 09:07:06 -08:00
|
|
|
]
|
|
|
|
print(f"Fetched feed. Found {len(entries)} new articles.")

        if contains_articles:
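            # The feed entries carry full article bodies; strip the HTML
            # and keep only the text.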
            for entry in entries:
                try:
                    print(f"Saving {entry['link']}")
                    con.execute(
                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
                        (
                            name,
                            category,
                            entry["link"],
                            bs4.BeautifulSoup(
                                entry["content"][0]["value"], features="lxml"
                            ).text,
                        ),
                    )
                except KeyError:
                    print("Not enough information. Skipping.")
        else:
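            # The feed only links to articles, so download each page and
            # extract its text.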
            for entry in entries:
                try:
                    print(f"Downloading {entry['link']}...")
                    con.execute(
                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
                        (
                            name,
                            category,
                            entry["link"],
                            g.extract(entry["link"]).cleaned_text,
                        ),
                    )
                    print("Done downloading.")
                except KeyError:
                    print("Not enough information. Skipping.")
finally:
    con.commit()
    con.close()