#!/usr/bin/env python3 # -*- coding: utf-8 -*- import asyncio, aiohttp, aiofiles, re, sys, time from pathlib import Path from bs4 import BeautifulSoup START_YEAR = 1923 END_YEAR = 2015 # legacy archive range OUT_DIR = Path("time_covers") SIZE_SUFFIX = "_400.jpg" # common size in legacy archive; try "_720.jpg" later if you verify availability CONCURRENCY = 6 # be nice PAUSE_SEC = 0.5 # polite delay between requests COVERSEARCH_URL = "https://content.time.com/time/coversearch/0,16871,,00.html" # Each cover page looks like: https://content.time.com/time/covers/0,16641,YYYYMMDD,00.html # On that page there is an that points to the direct image: # https://content.time.com/time/magazine/archive/covers/YYYY/1101YYYYMMDD_400.jpg HEADERS = { "User-Agent": "Mozilla/5.0 (compatible; cover-collector/1.0; +https://example.org)" } async def fetch(session, url): for attempt in range(3): try: async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as resp: if resp.status == 200: return await resp.text() elif resp.status in (403, 404): return None except Exception: await asyncio.sleep(1 + attempt) return None async def fetch_bytes(session, url): for attempt in range(3): try: async with session.get(url, timeout=aiohttp.ClientTimeout(total=60)) as resp: if resp.status == 200: return await resp.read() elif resp.status in (403, 404): return None except Exception: await asyncio.sleep(1 + attempt) return None def extract_coverpage_links(html, years_range): soup = BeautifulSoup(html, "html.parser") links = [] for a in soup.find_all("a", href=True): href = a["href"] # We only want legacy cover pages with YYYYMMDD in the URL m = re.search(r"/time/covers/0,16641,(\d{8}),00\.html", href) if m: y = int(m.group(1)[:4]) if y in years_range: links.append(("https://content.time.com" + m.group(0), m.group(1))) # de-dupe, keep stable order seen = set(); out = [] for u, d in links: if u not in seen: seen.add(u); out.append((u, d)) return out def build_image_url(date8, prefer_img_timeinc=False, size_suffix=SIZE_SUFFIX): yyyy = date8[:4] base = "https://img.timeinc.net" if prefer_img_timeinc else "https://content.time.com" return f"{base}/time/magazine/archive/covers/{yyyy}/1101{date8}{size_suffix}" async def download_one(session, cover_url, date8, out_dir): # Fetch cover page and locate the direct image URL html = await fetch(session, cover_url) if not html: return f"skip (no cover page): {date8}" # Try to grab the explicit image link if present; otherwise fall back to pattern m = re.search(r'https?://[^"]+/magazine/archive/covers/\d{4}/1101' + date8 + r'_\d+\.jpg', html) if m: img_url = m.group(0) else: # fallback guess (legacy pattern) img_url = build_image_url(date8, prefer_img_timeinc=False) # Pick filename like 19520107.jpg out_path = out_dir / f"{date8}.jpg" if out_path.exists(): return f"exists: {out_path.name}" data = await fetch_bytes(session, img_url) if data is None: # try the other host alt = build_image_url(date8, prefer_img_timeinc=True) data = await fetch_bytes(session, alt) if data is None: return f"missing image: {date8}" async with aiofiles.open(out_path, "wb") as f: await f.write(data) await asyncio.sleep(PAUSE_SEC) # polite pause return f"saved: {out_path.name}" async def main(): OUT_DIR.mkdir(parents=True, exist_ok=True) years = set(range(START_YEAR, END_YEAR + 1)) async with aiohttp.ClientSession(headers=HEADERS) as session: # Start from the legacy coversearch landing and harvest as many cover links as we can html = await fetch(session, COVERSEARCH_URL) if not html: print("Could not load the legacy coversearch page; aborting.") return cover_links = extract_coverpage_links(html, years) # If you want deeper harvesting, you could also follow “This Week in History” and “Recent Covers” # which appear on that page, or seed extra pages yourself. # Download concurrently (bounded) sem = asyncio.Semaphore(CONCURRENCY) async def bound_dl(link): async with sem: cover_url, date8 = link msg = await download_one(session, cover_url, date8, OUT_DIR) print(msg) return msg await asyncio.gather(*(bound_dl(link) for link in cover_links)) if __name__ == "__main__": try: asyncio.run(main()) except KeyboardInterrupt: sys.exit(0)