#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import asyncio, aiohttp, aiofiles, re, sys, time
from pathlib import Path
from bs4 import BeautifulSoup

# Inclusive range of cover years to collect (the legacy archive range).
START_YEAR: int = 1923
END_YEAR: int = 2015

# Directory the downloaded JPEGs are written to.
OUT_DIR: Path = Path("time_covers")

# File-name suffix that selects the image size. "_400.jpg" is the common size
# in the legacy archive; try "_720.jpg" only after verifying it is available.
SIZE_SUFFIX: str = "_400.jpg"

# Upper bound on simultaneous cover downloads -- kept small to be gentle on
# the server.
CONCURRENCY: int = 6

# Polite pause, in seconds, taken after an image has been saved.
PAUSE_SEC: float = 0.5

# Landing page of the legacy cover search; cover-page links are harvested
# from it.
COVERSEARCH_URL: str = "https://content.time.com/time/coversearch/0,16871,,00.html"

# URL shapes this script relies on:
#   cover page:  https://content.time.com/time/covers/0,16641,YYYYMMDD,00.html
#   cover image: https://content.time.com/time/magazine/archive/covers/YYYY/1101YYYYMMDD_400.jpg
# The cover page contains an <a> pointing at the direct image URL.

# Default headers sent with every request made through the shared session.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; cover-collector/1.0; +https://example.org)",
}

async def _get_with_retry(session, url, read_body, timeout_sec, attempts=3):
    """GET *url* and return ``await read_body(resp)``, or None on failure.

    Shared retry loop behind :func:`fetch` and :func:`fetch_bytes`.

    * HTTP 200: the body is read with *read_body* and returned.
    * HTTP 403/404: treated as permanent; returns None without retrying.
    * Any other status, a network error or a timeout: retried after a
      linearly growing pause (1s, 2s, ...), for at most *attempts* tries.

    Args:
        session: the aiohttp client session used to issue the request.
        url: address to fetch.
        read_body: callable taking the response and returning an awaitable
            that yields the decoded/raw body.
        timeout_sec: total per-attempt timeout in seconds.
        attempts: maximum number of tries.

    Returns:
        Whatever *read_body* produced, or None if every attempt failed.
    """
    timeout = aiohttp.ClientTimeout(total=timeout_sec)
    for attempt in range(attempts):
        try:
            async with session.get(url, timeout=timeout) as resp:
                if resp.status == 200:
                    return await read_body(resp)
                if resp.status in (403, 404):
                    return None
                # Any other status (429, 5xx, ...) may be transient: fall
                # through to the backoff below instead of retrying at once.
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError):
            # Only transport-level failures are retried; cancellation and
            # programming errors propagate to the caller.
            pass
        # No point in sleeping after the last attempt.
        if attempt + 1 < attempts:
            await asyncio.sleep(1 + attempt)
    return None


async def fetch(session, url):
    """Return the text body of *url*, or None if it could not be retrieved.

    Uses a 30 second total timeout per attempt. Undecodable bytes are
    replaced rather than raising, so a page with a wrong or missing charset
    declaration is still returned.
    """
    return await _get_with_retry(
        session, url, lambda resp: resp.text(errors="replace"), timeout_sec=30
    )


async def fetch_bytes(session, url):
    """Return the raw body of *url* as bytes, or None if it could not be retrieved.

    Uses a 60 second total timeout per attempt.
    """
    return await _get_with_retry(
        session, url, lambda resp: resp.read(), timeout_sec=60
    )

def extract_coverpage_links(html, years_range):
    """Collect legacy cover-page links from *html*.

    Every ``<a href=...>`` whose href contains a legacy cover-page path
    (``/time/covers/0,16641,YYYYMMDD,00.html``) and whose year is a member
    of *years_range* is kept.

    Args:
        html: markup of the page to scan.
        years_range: container of integer years tested with ``in``.

    Returns:
        A list of ``(absolute_cover_url, "YYYYMMDD")`` tuples, de-duplicated
        by URL, in order of first appearance.
    """
    cover_path = re.compile(r"/time/covers/0,16641,(\d{8}),00\.html")
    # dict preserves insertion order, so it doubles as an ordered "seen" set.
    unique = {}
    for anchor in BeautifulSoup(html, "html.parser").find_all("a", href=True):
        hit = cover_path.search(anchor["href"])
        if hit is None:
            continue
        date8 = hit.group(1)
        if int(date8[:4]) not in years_range:
            continue
        # setdefault keeps the first occurrence of each URL.
        unique.setdefault("https://content.time.com" + hit.group(0), date8)
    return list(unique.items())

def build_image_url(date8, prefer_img_timeinc=False, size_suffix=SIZE_SUFFIX):
    """Build the conventional direct image URL for a cover.

    Args:
        date8: issue date as an eight-character ``YYYYMMDD`` string.
        prefer_img_timeinc: when true, use the ``img.timeinc.net`` host
            instead of ``content.time.com``.
        size_suffix: trailing part of the file name that selects the size.

    Returns:
        The URL as a string.
    """
    if prefer_img_timeinc:
        host = "https://img.timeinc.net"
    else:
        host = "https://content.time.com"
    # Covers are filed under a per-year directory taken from the date.
    year_dir = f"{host}/time/magazine/archive/covers/{date8[:4]}"
    return f"{year_dir}/1101{date8}{size_suffix}"

async def download_one(session, cover_url, date8, out_dir):
    """Download the cover image for one issue into *out_dir*.

    Args:
        session: aiohttp client session passed through to the fetch helpers.
        cover_url: URL of the issue's cover page.
        date8: issue date as a ``YYYYMMDD`` string; also the output file stem.
        out_dir: ``pathlib.Path`` of the destination directory.

    Returns:
        A one-line status string, one of:
        ``"exists: <name>"``, ``"skip (no cover page): <date8>"``,
        ``"missing image: <date8>"`` or ``"saved:  <name>"``.
    """
    # Pick filename like 19520107.jpg
    out_path = out_dir / f"{date8}.jpg"
    # Check first so re-runs make no requests at all for covers already saved.
    if out_path.exists():
        return f"exists: {out_path.name}"

    html = await fetch(session, cover_url)
    if not html:
        return f"skip (no cover page): {date8}"

    # Candidate image URLs in order of preference: the explicit link on the
    # cover page (if any), then the guessed legacy pattern on each host.
    candidates = []
    # The host/path part excludes quotes, whitespace and angle brackets so
    # the match cannot run across attribute or tag boundaries.
    m = re.search(
        r"""https?://[^"'\s<>]+/magazine/archive/covers/\d{4}/1101"""
        + re.escape(date8)
        + r"_\d+\.jpg",
        html,
    )
    if m:
        candidates.append(m.group(0))
    for use_timeinc in (False, True):
        guess = build_image_url(date8, prefer_img_timeinc=use_timeinc)
        # Skip a guess that equals a URL already queued.
        if guess not in candidates:
            candidates.append(guess)

    data = None
    for img_url in candidates:
        data = await fetch_bytes(session, img_url)
        if data:
            break
    # An empty body is treated like a miss so no zero-byte file is saved.
    if not data:
        return f"missing image: {date8}"

    # Write to a temporary name and rename afterwards: an interrupted write
    # must not leave a truncated file that later runs would report as "exists".
    tmp_path = out_path.with_name(out_path.name + ".part")
    async with aiofiles.open(tmp_path, "wb") as f:
        await f.write(data)
    tmp_path.replace(out_path)

    await asyncio.sleep(PAUSE_SEC)  # polite pause
    return f"saved:  {out_path.name}"

async def main():
    """Harvest cover links from the coversearch page and download each cover.

    Creates ``OUT_DIR`` if needed, loads ``COVERSEARCH_URL``, extracts the
    cover-page links whose year lies in ``START_YEAR``..``END_YEAR``
    (inclusive) and downloads them with at most ``CONCURRENCY`` downloads in
    flight. Each download's status line is printed as it completes. If the
    landing page cannot be loaded, a message is printed and nothing is
    downloaded.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    wanted_years = set(range(START_YEAR, END_YEAR + 1))

    async with aiohttp.ClientSession(headers=HEADERS) as session:
        landing_html = await fetch(session, COVERSEARCH_URL)
        if not landing_html:
            print("Could not load the legacy coversearch page; aborting.")
            return

        # Only links present on the landing page are harvested. To go deeper,
        # seed additional pages here and merge their links.
        limiter = asyncio.Semaphore(CONCURRENCY)

        async def bound_dl(link):
            # Hold a semaphore slot for the whole download, including the
            # status print, so at most CONCURRENCY run at once.
            cover_url, date8 = link
            async with limiter:
                status = await download_one(session, cover_url, date8, OUT_DIR)
                print(status)
            return status

        jobs = [
            bound_dl(link)
            for link in extract_coverpage_links(landing_html, wanted_years)
        ]
        await asyncio.gather(*jobs)

# Script entry point: run the async pipeline to completion.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl-C is an expected way to stop; exit with status 0.
        raise SystemExit(0)