#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (c) 2022 Samuel L Sloniker
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
import feedparser
import sqlite3
import goose3
import tomli
import bs4


def matches(string, checks):
    """Return True if `string` matches any of the exclusion checks.

    Each check is a table like {"type": "startswith", "pattern": "..."};
    only the "startswith" check type is currently implemented.
    """
    for check in checks:
        if check["type"] == "startswith" and string.startswith(
            check["pattern"]
        ):
            return True
    return False


with open("sources.toml", "rb") as f:
    sources = tomli.load(f)

g = goose3.Goose()

con = sqlite3.connect("articles.db")
con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")

# URLs already stored in the database; a set gives fast membership tests.
known = {i[0] for i in con.execute("SELECT url FROM articles;")}

try:
    # Retry articles whose text was previously stored as the '***' placeholder.
    # fetchall() materializes the rows up front so the UPDATE below does not
    # modify the table while a cursor is still iterating over it.
    for (url,) in con.execute(
        "SELECT url FROM articles WHERE (text = '***');"
    ).fetchall():
        print(f"Downloading {url}...")
        con.execute(
            "UPDATE articles SET text = ? WHERE (url = ?);",
            (
                g.extract(url).cleaned_text,
                url,
            ),
        )
        print("Done downloading.")

    for name, config in sources.items():
        url = config["feed"]
        category = config["category"]
        contains_articles = config["contains_articles"]

        print(f"Fetching {name} feed ({url})...")

        # Keep only entries that are new and not excluded by the source config.
        entries = [
            entry
            for entry in feedparser.parse(url)["entries"]
            if entry["link"] not in known
            and not matches(entry["link"], config.get("exclude", []))
        ]

        print(f"Fetched feed. Found {len(entries)} new articles.")

        if contains_articles:
            # The feed carries full article bodies; strip the HTML and store
            # the plain text directly.
            for entry in entries:
                try:
                    print(f"Saving {entry['link']}")
                    con.execute(
                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
                        (
                            name,
                            category,
                            entry["link"],
                            bs4.BeautifulSoup(
                                entry["content"][0]["value"], features="lxml"
                            ).text,
                        ),
                    )
                except KeyError:
                    print("Not enough information. Skipping.")
        else:
            # The feed only links to articles; fetch each page and extract
            # the body text with goose3.
            for entry in entries:
                try:
                    print(f"Downloading {entry['link']}...")
                    con.execute(
                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
                        (
                            name,
                            category,
                            entry["link"],
                            g.extract(entry["link"]).cleaned_text,
                        ),
                    )
                    print("Done downloading.")
                except KeyError:
                    print("Not enough information. Skipping.")
finally:
    # Persist whatever was inserted even if a download failed midway.
    con.commit()
    con.close()
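
# For reference, an illustrative sources.toml sketch, inferred from the keys
# this script reads (feed, category, contains_articles, and the optional
# exclude array). The source name, URLs, and category below are placeholders,
# not values from any real configuration:
#
#   [example-news]
#   feed = "https://example.com/rss.xml"
#   category = "news"
#   contains_articles = false
#
#   [[example-news.exclude]]
#   type = "startswith"
#   pattern = "https://example.com/sponsored/"
#
# With contains_articles = false, each linked page is fetched and extracted
# via goose3; with true, the article body is taken from the feed entry itself.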