#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later

# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.

import feedparser
import sqlite3
import goose3
import tomli
import bs4


# Return True if `string` matches any of the exclusion checks; only the
# "startswith" check type is recognized, and any other type is ignored.
def matches(string, checks):
    for check in checks:
        if check["type"] == "startswith" and string.startswith(
            check["pattern"]
        ):
            return True
    return False


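# Illustrative sources.toml layout, inferred from the keys this script reads
# (feed, category, contains_articles, and the optional exclude list consumed
# by matches()); the section name, URL, and values below are examples only:
#
#     [example-site]
#     feed = "https://example.com/feed.xml"
#     category = "technology"
#     contains_articles = true
#     exclude = [
#         { type = "startswith", pattern = "https://example.com/sponsored/" },
#     ]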
with open("sources.toml", "rb") as f:
    sources = tomli.load(f)

# goose3 downloads a page and extracts the main article text
g = goose3.Goose()

con = sqlite3.connect("articles.db")
con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")

# URLs already in the database, used to skip articles seen on a previous run
known = [i[0] for i in con.execute("SELECT url FROM articles;")]

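# Two passes follow: first, download full text for any existing rows whose
# text is the '***' placeholder; then walk each configured feed and insert
# new articles, either straight from the feed content or by downloading the
# linked page.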
try:
    for (url,) in con.execute("SELECT url FROM articles WHERE (text = '***');"):
        print(f"Downloading {url}...")
        con.execute(
            "UPDATE articles SET text = ? WHERE (url = ?);",
            (
                g.extract(url).cleaned_text,
                url,
            ),
        )
    print("Done downloading.")

    for name, config in sources.items():
        url = config["feed"]
        category = config["category"]
        contains_articles = config["contains_articles"]

        print(f"Fetching {name} feed ({url})...")
        # Keep only entries that are new and not excluded for this source
        entries = [
            entry
            for entry in feedparser.parse(url)["entries"]
            if entry["link"] not in known
            and not matches(entry["link"], config.get("exclude", []))
        ]
        print(f"Fetched feed. Found {len(entries)} new articles.")

        if contains_articles:
            # The feed itself carries the full article body; strip the HTML
            # with BeautifulSoup and store the plain text.
            for entry in entries:
                try:
                    print(f"Saving {entry['link']}")
                    con.execute(
                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
                        (
                            name,
                            category,
                            entry["link"],
                            bs4.BeautifulSoup(
                                entry["content"][0]["value"], features="lxml"
                            ).text,
                        ),
                    )
                except KeyError:
                    print("Not enough information. Skipping.")
        else:
            # The feed only links to articles; download each page and extract
            # its text with goose3.
            for entry in entries:
                try:
                    print(f"Downloading {entry['link']}...")
                    con.execute(
                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
                        (
                            name,
                            category,
                            entry["link"],
                            g.extract(entry["link"]).cleaned_text,
                        ),
                    )
                    print("Done downloading.")
                except KeyError:
                    print("Not enough information. Skipping.")
finally:
    # Persist whatever was inserted, even if a download failed partway through
    con.commit()
    con.close()
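
# To spot-check what was collected (illustrative example, not part of the
# original workflow):
#
#     sqlite3 articles.db "SELECT source, category, url FROM articles LIMIT 5;"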