import re
import tkinter as tk
from tkinter import filedialog

from bs4 import BeautifulSoup

# Matches hrefs containing "/categories/<digits>/posts/<digits>" and captures
# both numeric ids. Compiled once so the anchor loop does not repeat the lookup.
_POST_HREF_RE = re.compile(r"/categories/(\d+)/posts/(\d+)")


def main():
    """Print every category/post link found in a saved HTML file's outline.

    Opens a Tkinter file dialog so the user can choose an HTML file, parses
    it with BeautifulSoup, locates the ``<div class="product-outline">``
    element and, for each anchor inside it whose ``href`` contains
    ``/categories/<id>/posts/<id>``, prints the href, the anchor text (with
    ``<i>``/``<svg>`` icon tags removed) and the two ids.

    Returns early, after printing a message, when no file is chosen or the
    outline ``div`` is missing.
    """
    # Tkinter needs a root window for the dialog; hide it so that only the
    # file picker is shown.
    root = tk.Tk()
    root.withdraw()
    try:
        html_file = filedialog.askopenfilename(
            title="Select the saved HTML file",
            filetypes=[("HTML files", "*.html"), ("All files", "*.*")],
        )
    finally:
        # The root is only needed for the dialog; destroy it so the Tk
        # interpreter and its hidden window do not outlive the selection.
        root.destroy()

    if not html_file:
        print("No file selected.")
        return

    # Read the HTML file.
    with open(html_file, encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, "html.parser")

    # Find the curriculum container.
    outline_div = soup.find("div", class_="product-outline")
    if not outline_div:
        print('No <div class="product-outline"> found.')
        return

    # Walk every anchor that carries an href and keep the matching ones.
    for a in outline_div.find_all("a", href=True):
        match = _POST_HREF_RE.search(a["href"])
        if not match:
            continue
        category_id, post_id = match.groups()

        # Remove icon tags first so their content cannot leak into the title.
        # NOTE: decompose() mutates the parsed tree in place.
        for tag in a.find_all(["i", "svg"]):
            tag.decompose()

        title = a.get_text(strip=True)
        print(
            f"Found: {a['href']} | Title: {title} | "
            f"Category: {category_id} | Post: {post_id}"
        )


if __name__ == "__main__":
    main()