import re
from bs4 import BeautifulSoup
import tkinter as tk
from tkinter import filedialog

# Matches curriculum links whose href contains "/categories/<digits>/posts/<digits>";
# group 1 is the category id, group 2 is the post id.
_POST_HREF_RE = re.compile(r"/categories/(\d+)/posts/(\d+)")


def _pick_html_file():
    """Show a file-open dialog and return the chosen path (falsy if cancelled)."""
    root = tk.Tk()
    # Hide the empty root window; only the dialog itself should be visible.
    root.withdraw()
    try:
        return filedialog.askopenfilename(
            title="Select the saved HTML file",
            filetypes=[("HTML files", "*.html"), ("All files", "*.*")],
        )
    finally:
        # Always release the hidden root window, even if the dialog raises.
        root.destroy()


def _read_markup(path):
    """Return the file's markup as UTF-8 text, or as raw bytes if it is not UTF-8."""
    try:
        with open(path, encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: hand BeautifulSoup the raw bytes so it can detect
        # the document's encoding itself instead of crashing here.
        with open(path, "rb") as f:
            return f.read()


def main():
    """Ask for a saved HTML file and print every category/post link in its outline.

    Looks for the first ``<div class="product-outline">`` in the document and,
    for each anchor inside it whose ``href`` contains
    ``/categories/<id>/posts/<id>``, prints the href, the anchor text (with
    ``<i>``/``<svg>`` icon tags removed), the category id and the post id.

    Prints a short message and returns early when no file is selected or the
    outline container is missing. Errors raised while opening the file
    (e.g. ``OSError``) are not caught.
    """
    html_file = _pick_html_file()
    if not html_file:
        print("No file selected.")
        return

    soup = BeautifulSoup(_read_markup(html_file), "html.parser")

    # Find the curriculum container
    outline_div = soup.find("div", class_="product-outline")
    if outline_div is None:
        print("No <div class='product-outline'> found.")
        return

    # Loop through all anchors that carry an href attribute
    for a in outline_div.find_all("a", href=True):
        match = _POST_HREF_RE.search(a["href"])
        if not match:
            continue

        category_id, post_id = match.groups()
        # Remove any icon tags before extracting text so they do not
        # contribute to the title (this mutates the parsed tree in place).
        for tag in a.find_all(["i", "svg"]):
            tag.decompose()
        title = a.get_text(strip=True)
        print(f"Found: {a['href']} | Title: {title} | Category: {category_id} | Post: {post_id}")

# Run the scraper only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()