gptc-news-model/download.py

87 lines
2.7 KiB
Python
Raw Normal View History

2022-11-24 09:07:06 -08:00
import feedparser
import sqlite3
import goose3
import tomli
import bs4
2022-11-24 12:23:36 -08:00
def matches(string, checks):
    """Return True if *string* satisfies at least one rule in *checks*.

    Args:
        string: The text to test (the caller passes an entry's link URL).
        checks: An iterable of mappings, each with a "type" key. The only
            type acted on is "startswith", which additionally needs a
            "pattern" key holding the prefix to compare against. Rules of
            any other type never match.

    Returns:
        True when any "startswith" rule's pattern is a prefix of *string*;
        False otherwise, including when *checks* is empty.

    Raises:
        KeyError: If a rule has no "type" key, or a "startswith" rule has
            no "pattern" key.
    """
    # The type test comes first so "pattern" is only read for rules that
    # actually need it.
    return any(
        check["type"] == "startswith" and string.startswith(check["pattern"])
        for check in checks
    )
2022-11-24 09:07:06 -08:00
# Load the feed definitions. Every top-level table in sources.toml describes
# one feed and must provide "feed", "category" and "contains_articles"; an
# "exclude" list of match rules is optional.
with open("sources.toml", "rb") as f:
    sources = tomli.load(f)

g = goose3.Goose()
con = sqlite3.connect("articles.db")
con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")

# URLs already stored. A set keeps the per-entry membership test O(1) and lets
# links saved during this run be recorded as they are inserted.
known = {row[0] for row in con.execute("SELECT url FROM articles;")}

try:
    # Rows whose text is the literal '***' get their text fetched again
    # (presumably a marker for articles that should be re-downloaded).
    # The URLs are collected up front so the table is not updated while the
    # SELECT on the same connection is still being stepped.
    pending = [
        url
        for (url,) in con.execute("SELECT url FROM articles WHERE (text = '***');")
    ]
    for url in pending:
        print(f"Downloading {url}...")
        con.execute(
            "UPDATE articles SET text = ? WHERE (url = ?);",
            (
                g.extract(url).cleaned_text,
                url,
            ),
        )
        print("Done downloading.")

    for name, config in sources.items():
        url = config["feed"]
        category = config["category"]
        contains_articles = config["contains_articles"]
        exclude = config.get("exclude", [])
        print(f"Fetching {name} feed ({url})...")
        # Keep only entries that have a link, are not stored yet and are not
        # excluded by the feed's rules. Entries without a link are dropped
        # here instead of aborting the whole run with a KeyError.
        entries = [
            entry
            for entry in feedparser.parse(url)["entries"]
            if "link" in entry
            and entry["link"] not in known
            and not matches(entry["link"], exclude)
        ]
        print(f"Fetched feed. Found {len(entries)} new articles.")

        if contains_articles:
            # The feed carries the article body itself as HTML, so the text
            # is taken from the entry rather than fetched from the link.
            for entry in entries:
                try:
                    link = entry["link"]
                    print(f"Saving {link}")
                    con.execute(
                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
                        (
                            name,
                            category,
                            link,
                            bs4.BeautifulSoup(
                                entry["content"][0]["value"], features="lxml"
                            ).text,
                        ),
                    )
                    # Remember the link so a later feed in this run does not
                    # store the same article a second time.
                    known.add(link)
                except (KeyError, IndexError):
                    # Missing "content"/"value", or an empty content list.
                    print("Not enough information. Skipping.")
        else:
            # The feed only links to the article; fetch and extract the page.
            for entry in entries:
                try:
                    link = entry["link"]
                    print(f"Downloading {link}...")
                    con.execute(
                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
                        (
                            name,
                            category,
                            link,
                            g.extract(link).cleaned_text,
                        ),
                    )
                    known.add(link)
                    print("Done downloading.")
                except KeyError:
                    print("Not enough information. Skipping.")
finally:
    # Persist whatever was written, even if a fetch failed part-way through.
    con.commit()
    con.close()