# pkmntcg-backend/pythonScripts/download_baseset.py

import os
import re
import requests
from bs4 import BeautifulSoup

# Saved HTML page that is scanned for image links.
HTML_FILE = "baseset.html"
# Folder the downloaded images are written to (created if missing).
DOWNLOAD_FOLDER = "baseset"

# Set name as it appears in the lightbox title -> number used in the output
# filename ("base<number>-<card>.jpg").
SET_MAP = {
    "Basis-Set": 1,
}

# Resized variants end in "-<width>x<height>.jpg" (e.g. "-427x600.jpg").
_RESIZED_RE = re.compile(r"-\d+x\d+\.jpg$")

# Title format: "Abra 43/102 - Basis-Set" -> name, card, total, set name.
_TITLE_RE = re.compile(r"(.+)\s+(\d+)\/(\d+)\s*-\s*(.+)")


def is_original_jpg(url):
    """Return True if *url* ends in ".jpg" and is not a resized variant.

    The comparison is case-insensitive. URLs ending in a
    "-<width>x<height>.jpg" suffix are rejected because only the original
    image is wanted.
    """
    lowered = url.lower()
    return lowered.endswith(".jpg") and not _RESIZED_RE.search(lowered)


def target_filename(title):
    """Return the local filename for a lightbox title, or None to skip it.

    A title such as "Abra 43/102 - Basis-Set" yields "base1-43.jpg": the set
    name is looked up in SET_MAP and the card number is converted to an int
    (so leading zeros are dropped). A message is printed, and None returned,
    when the title does not match the expected format or the set name has no
    SET_MAP entry.
    """
    match = _TITLE_RE.match(title.strip())
    if not match:
        print(f"Skipping unmatched title format: {title}")
        return None

    # Only the card number and set name feed the filename.
    _name, card, _total, set_name = match.groups()

    if set_name not in SET_MAP:
        print(f"Unknown set: {set_name}, please map it.")
        return None

    return f"base{SET_MAP[set_name]}-{int(card)}.jpg"


def collect_entries(anchors):
    """Build the ordered list of (url, filename) pairs to download.

    *anchors* is an iterable of objects offering a dict-style ``get`` for
    their attributes (the script passes the result of
    ``soup.find_all("a")``). An anchor is kept when it has a
    "data-elementor-lightbox-title" attribute, its "href" passes
    is_original_jpg(), and its title maps to a filename. Repeated identical
    (url, filename) pairs are dropped so the same image is not fetched
    twice; first-seen order is preserved.
    """
    entries = []
    seen = set()

    for anchor in anchors:
        url = anchor.get("href", "")
        title = anchor.get("data-elementor-lightbox-title")

        if not title or not is_original_jpg(url):
            continue

        filename = target_filename(title)
        if filename is None:
            continue

        entry = (url, filename)
        if entry in seen:
            continue
        seen.add(entry)
        entries.append(entry)

    return entries


def download_entries(entries, folder=DOWNLOAD_FOLDER):
    """Download every (url, filename) pair in *entries* into *folder*.

    Failures are reported on stdout and do not stop the batch: HTTP/network
    errors (including non-2xx responses, via raise_for_status) and errors
    while writing the file both print "  Failed: ..." and move on to the
    next entry.
    """
    # One session for the whole batch so connections can be reused.
    with requests.Session() as session:
        for url, filename in entries:
            filepath = os.path.join(folder, filename)
            print(f"Downloading {filename} from {url}")

            try:
                response = session.get(url, timeout=10)
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"  Failed: {e}")
                continue

            try:
                with open(filepath, "wb") as f:
                    f.write(response.content)
            except OSError as e:
                print(f"  Failed: {e}")


def main():
    """Parse HTML_FILE and download the original card images it links to."""
    os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

    with open(HTML_FILE, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    entries = collect_entries(soup.find_all("a"))
    print(f"Found {len(entries)} images to download.")

    download_entries(entries)
    print("Done!")


if __name__ == "__main__":
    main()