Add ability to exclude articles by URL

Author: Samuel Sloniker
Date: 2022-11-24 12:23:36 -08:00
Parent: 9562634a03
Commit: 7bf17b150e
Signed by: kj7rrv (GPG Key ID: 1BB4029E66285A62)
2 changed files with 42 additions and 24 deletions


@@ -6,6 +6,14 @@ import tomli
 import gptc
 import bs4
 
+
+def matches(string, checks):
+    for check in checks:
+        if check["type"] == "startswith" and string.startswith(check["pattern"]):
+            return True
+    return False
+
+
 with open("sources.toml", "rb") as f:
     sources = tomli.load(f)
@@ -38,37 +46,43 @@ try:
         entries = [
             entry
             for entry in feedparser.parse(url)["entries"]
-            if not entry["link"] in known
+            if not entry["link"] in known and not matches(entry["link"], config.get("exclude", []))
         ]
 
         print(f"Fetched feed. Found {len(entries)} new articles.")
 
         if contains_articles:
             for entry in entries:
-                print(f"Saving {entry['link']}")
-                con.execute(
-                    "INSERT INTO articles VALUES (?, ?, ?, ?);",
-                    (
-                        name,
-                        category,
-                        entry["link"],
-                        bs4.BeautifulSoup(
-                            entry["content"][0]["value"], features="lxml"
-                        ).text,
-                    ),
-                )
+                try:
+                    print(f"Saving {entry['link']}")
+                    con.execute(
+                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
+                        (
+                            name,
+                            category,
+                            entry["link"],
+                            bs4.BeautifulSoup(
+                                entry["content"][0]["value"], features="lxml"
+                            ).text,
+                        ),
+                    )
+                except KeyError:
+                    print("Not enough information. Skipping.")
         else:
             for entry in entries:
-                print(f"Downloading {entry['link']}...")
-                con.execute(
-                    "INSERT INTO articles VALUES (?, ?, ?, ?);",
-                    (
-                        name,
-                        category,
-                        entry["link"],
-                        g.extract(entry["link"]).cleaned_text,
-                    ),
-                )
-                print(f"Done downloading.")
+                try:
+                    print(f"Downloading {entry['link']}...")
+                    con.execute(
+                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
+                        (
+                            name,
+                            category,
+                            entry["link"],
+                            g.extract(entry["link"]).cleaned_text,
+                        ),
+                    )
+                    print(f"Done downloading.")
+                except KeyError:
+                    print("Not enough information. Skipping.")
 finally:
     con.commit()
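
As a quick illustration of the new helper, here is a minimal sketch of matches() in action. The exclude list mirrors the rules added to sources.toml (tomli parses the inline tables into plain dicts); the article URLs are invented for the example:

# Illustrative only: exclude rules as tomli parses them from sources.toml.
exclude = [
    {"type": "startswith", "pattern": "https://pagesix.com"},
    {"type": "startswith", "pattern": "https://decider.com"},
]

matches("https://pagesix.com/2022/11/24/example/", exclude)  # True -> article skipped
matches("https://nypost.com/2022/11/24/example/", exclude)   # False -> article kept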


@@ -49,6 +49,10 @@ contains_articles=true
 feed="https://nypost.com/feed"
 category="right"
 contains_articles=false
+exclude=[
+    { type="startswith", pattern="https://pagesix.com" },
+    { type="startswith", pattern="https://decider.com" },
+]
 
 [federalist]
 feed="https://thefederalist.com/feed"