import feedparser
import hashlib
import sqlite3
import goose3
import tomli
import gptc
import bs4


def matches(string, checks):
    """Return True if the string matches any of the configured exclude rules."""
    for check in checks:
        if check["type"] == "startswith" and string.startswith(check["pattern"]):
            return True
    return False


with open("sources.toml", "rb") as f:
    sources = tomli.load(f)

g = goose3.Goose()

con = sqlite3.connect("articles.db")
con.execute("CREATE TABLE IF NOT EXISTS articles(source, category, url, text);")
known = [i[0] for i in con.execute("SELECT url FROM articles;")]

try:
    # Retry articles whose text is still the '***' placeholder from an earlier run.
    for (url,) in con.execute("SELECT url FROM articles WHERE (text = '***');"):
        print(f"Downloading {url}...")
        con.execute(
            "UPDATE articles SET text = ? WHERE (url = ?);",
            (
                g.extract(url).cleaned_text,
                url,
            ),
        )
        print("Done downloading.")

    for name, config in sources.items():
        url = config["feed"]
        category = config["category"]
        contains_articles = config["contains_articles"]

        print(f"Fetching {name} feed ({url})...")
        entries = [
            entry
            for entry in feedparser.parse(url)["entries"]
            if entry["link"] not in known
            and not matches(entry["link"], config.get("exclude", []))
        ]
        print(f"Fetched feed. Found {len(entries)} new articles.")

        if contains_articles:
            # The feed entries carry the full article body; strip the HTML and store it.
            for entry in entries:
                try:
                    print(f"Saving {entry['link']}")
                    con.execute(
                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
                        (
                            name,
                            category,
                            entry["link"],
                            bs4.BeautifulSoup(
                                entry["content"][0]["value"], features="lxml"
                            ).text,
                        ),
                    )
                except KeyError:
                    print("Not enough information. Skipping.")
        else:
            # The feed only links to articles; download each one and extract its text.
            for entry in entries:
                try:
                    print(f"Downloading {entry['link']}...")
                    con.execute(
                        "INSERT INTO articles VALUES (?, ?, ?, ?);",
                        (
                            name,
                            category,
                            entry["link"],
                            g.extract(entry["link"]).cleaned_text,
                        ),
                    )
                    print("Done downloading.")
                except KeyError:
                    print("Not enough information. Skipping.")
finally:
    con.commit()

    print("Compiling model...")
    raw_model = [
        {"text": i[0], "category": i[1]}
        for i in con.execute("SELECT text, category FROM articles;")
    ]
    with open("model.gptc", "w+b") as f:
        f.write(
            gptc.compile(raw_model, max_ngram_length=3, min_count=3).serialize()
        )

    con.close()
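
# A minimal sketch of the sources.toml layout this script reads. The table name,
# URLs, and exclude pattern below are hypothetical examples, not from the original;
# only the keys (feed, category, contains_articles, exclude) are assumed from the
# code above.
#
#   [example-blog]
#   feed = "https://example.com/feed.xml"   # RSS/Atom feed to poll
#   category = "technology"                 # label stored with each article
#   contains_articles = true                # entries include full HTML bodies
#   exclude = [
#       { type = "startswith", pattern = "https://example.com/podcast/" },
#   ]
#
# After tomli.load, each table becomes one (name, config) pair in `sources`, e.g.:
#
#   {"example-blog": {"feed": "https://example.com/feed.xml",
#                     "category": "technology",
#                     "contains_articles": True,
#                     "exclude": [{"type": "startswith", "pattern": "..."}]}}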