From 7bf17b150e792f574e1ea6389ccf73bce603892d Mon Sep 17 00:00:00 2001
From: Samuel Sloniker
Date: Thu, 24 Nov 2022 12:23:36 -0800
Subject: [PATCH] Add ability to exclude articles by URL

---
NOTE(review): this patch was recovered from a whitespace-collapsed copy.
Indentation and the position of blank context lines were reconstructed from
the hunk line counts; confirm with `git apply --check` before relying on it.

Summary of the change: adds matches(string, checks), which returns True when
the string starts with the "pattern" of any check whose "type" is
"startswith". Feed entries whose link matches the source's optional `exclude`
list are dropped, and an entry that raises KeyError while being saved is
reported ("Not enough information. Skipping.") instead of aborting the run.

 download.py  | 62 ++++++++++++++++++++++++++++++++--------------------
 sources.toml |  4 ++++
 2 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/download.py b/download.py
index 5f27f67..619159d 100644
--- a/download.py
+++ b/download.py
@@ -6,6 +6,14 @@ import tomli
 import gptc
 import bs4
 
+
+def matches(string, checks):
+    for check in checks:
+        if check["type"] == "startswith" and string.startswith(check["pattern"]):
+            return True
+    return False
+
+
 with open("sources.toml", "rb") as f:
     sources = tomli.load(f)
 
@@ -38,37 +46,43 @@ try:
         entries = [
             entry
             for entry in feedparser.parse(url)["entries"]
-            if not entry["link"] in known
+            if not entry["link"] in known and not matches(entry["link"], config.get("exclude", []))
         ]
 
         print(f"Fetched feed. Found {len(entries)} new articles.")
 
         if contains_articles:
             for entry in entries:
-                print(f"Saving {entry['link']}")
-                con.execute(
-                    "INSERT INTO articles VALUES (?, ?, ?, ?);",
-                    (
-                        name,
-                        category,
-                        entry["link"],
-                        bs4.BeautifulSoup(
-                            entry["content"][0]["value"], features="lxml"
-                        ).text,
-                    ),
-                )
+                try:
+                    print(f"Saving {entry['link']}")
+                    con.execute(
+                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
+                        (
+                            name,
+                            category,
+                            entry["link"],
+                            bs4.BeautifulSoup(
+                                entry["content"][0]["value"], features="lxml"
+                            ).text,
+                        ),
+                    )
+                except KeyError:
+                    print("Not enough information. Skipping.")
         else:
             for entry in entries:
-                print(f"Downloading {entry['link']}...")
-                con.execute(
-                    "INSERT INTO articles VALUES (?, ?, ?, ?);",
-                    (
-                        name,
-                        category,
-                        entry["link"],
-                        g.extract(entry["link"]).cleaned_text,
-                    ),
-                )
-                print(f"Done downloading.")
+                try:
+                    print(f"Downloading {entry['link']}...")
+                    con.execute(
+                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
+                        (
+                            name,
+                            category,
+                            entry["link"],
+                            g.extract(entry["link"]).cleaned_text,
+                        ),
+                    )
+                    print(f"Done downloading.")
+                except KeyError:
+                    print("Not enough information. Skipping.")
 finally:
     con.commit()
diff --git a/sources.toml b/sources.toml
index 5b9a732..cde044b 100644
--- a/sources.toml
+++ b/sources.toml
@@ -49,6 +49,10 @@ contains_articles=true
 feed="https://nypost.com/feed"
 category="right"
 contains_articles=false
+exclude=[
+    { type="startswith", pattern="https://pagesix.com" },
+    { type="startswith", pattern="https://decider.com" },
+]
 
 [federalist]
 feed="https://thefederalist.com/feed"