restructure backend

2026-01-16 18:07:23 +01:00
parent ebcc33b1f7
commit cd9d2e9900
21 changed files with 4323 additions and 69 deletions
--- a/pythonScripts/build_index.py
+++ b/pythonScripts/build_index.py
@@ -0,0 +1,61 @@
+import os
+import numpy as np
+from PIL import Image
+import torch
+import open_clip
+import faiss
+
+# --- Configuration ---
+CARDS_FOLDER = "cards_old"
+EMBEDDINGS_FILE = "embeddings.npy"
+IDS_FILE = "ids.npy"
+FAISS_INDEX_FILE = "card_index.faiss"
+
+# --- Device ---
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print("Using device:", device)
+
+# --- Load CLIP model ---
+model, _, preprocess = open_clip.create_model_and_transforms(
+    'ViT-L-14', pretrained='laion2b_s32b_b82k'
+)
+model = model.to(device).eval()
+
+# --- Helper: encode image ---
+def encode_image(path):
+    img = Image.open(path).convert("RGB")
+    with torch.no_grad():
+        emb = model.encode_image(preprocess(img).unsqueeze(0).to(device))
+    return emb.cpu().numpy()
+
+# --- Build embeddings ---
+embeddings = []
+ids = []
+
+for fname in os.listdir(CARDS_FOLDER):
+    if fname.lower().endswith((".jpg", ".png")):
+        path = os.path.join(CARDS_FOLDER, fname)
+        emb = encode_image(path)
+        embeddings.append(emb)
+        ids.append(fname)
+        print("Encoded:", fname)
+
+embeddings = np.vstack(embeddings)
+
+# --- Save embeddings & IDs ---
+np.save(EMBEDDINGS_FILE, embeddings)
+np.save(IDS_FILE, np.array(ids))
+print("Saved embeddings and IDs.")
+
+# --- Normalize embeddings ---
+faiss.normalize_L2(embeddings)
+
+# --- Build FAISS index ---
+d = embeddings.shape[1]  # embedding dimension
+index = faiss.IndexFlatIP(d)  # inner product = cosine similarity
+index.add(embeddings)
+print("FAISS index built with", index.ntotal, "cards.")
+
+# --- Save FAISS index ---
+faiss.write_index(index, FAISS_INDEX_FILE)
+print("FAISS index saved:", FAISS_INDEX_FILE)
--- a/pythonScripts/download_baseset.py
+++ b/pythonScripts/download_baseset.py
@@ -0,0 +1,65 @@
+import os
+import re
+import requests
+from bs4 import BeautifulSoup
+
+HTML_FILE = "baseset.html"
+DOWNLOAD_FOLDER = "baseset"
+
+SET_MAP = {
+    "Basis-Set": 1,
+}
+
+os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
+
+with open(HTML_FILE, "r", encoding="utf-8") as f:
+    soup = BeautifulSoup(f, "html.parser")
+
+entries = []
+
+for a in soup.find_all("a"):
+    url = a.get("href", "")
+    title = a.get("data-elementor-lightbox-title")
+
+    if not title or not url.lower().endswith(".jpg"):
+        continue
+
+    # We only want the original (no -427x600 etc)
+    if re.search(r"-\d+x\d+\.jpg$", url.lower()):
+        continue
+
+    # Parse title: "Abra 43/102 - Basis-Set"
+    m = re.match(r"(.+)\s+(\d+)\/(\d+)\s*-\s*(.+)", title.strip())
+    if not m:
+        print(f"Skipping unmatched title format: {title}")
+        continue
+
+    name, card, total, set_name = m.groups()
+    card = int(card)
+
+    if set_name not in SET_MAP:
+        print(f"Unknown set: {set_name}, please map it.")
+        continue
+
+    set_num = SET_MAP[set_name]
+    new_filename = f"base{set_num}-{card}.jpg"
+
+    entries.append((url, new_filename))
+
+print(f"Found {len(entries)} images to download.")
+
+for url, filename in entries:
+    filepath = os.path.join(DOWNLOAD_FOLDER, filename)
+    print(f"Downloading {filename} from {url}")
+
+    try:
+        r = requests.get(url, timeout=10)
+        r.raise_for_status()
+    except Exception as e:
+        print(f"  Failed: {e}")
+        continue
+
+    with open(filepath, "wb") as f:
+        f.write(r.content)
+
+print("Done!")
--- a/pythonScripts/download_cards.py
+++ b/pythonScripts/download_cards.py
@@ -0,0 +1,56 @@
+import os
+import requests
+from time import sleep
+
+# --- Configuration ---
+TCGDEX_API = "https://api.tcgdex.net/v2/de/cards"
+OUTPUT_FOLDER = "cards"
+REQUEST_DELAY = 0.1  # seconds between requests to avoid rate limiting
+
+# Create output folder if not exists
+os.makedirs(OUTPUT_FOLDER, exist_ok=True)
+
+# Fetch card list from TCGdex
+print("Fetching card list...")
+resp = requests.get(TCGDEX_API)
+if resp.status_code != 200:
+    raise Exception(f"Failed to fetch card list: {resp.status_code}")
+cards = resp.json()
+print(f"Total cards fetched: {len(cards)}")
+
+# Download each card image
+for card in cards:
+    card_id = card.get("id", None)
+    image_base = card.get("image", None)
+
+    if not card_id:
+        print("Skipping card with missing ID:", card)
+        continue
+
+    if not image_base:
+        print(f"No image URL for {card_id}, skipping...")
+        continue
+
+    image_url = image_base + "/high.png"
+    output_path = os.path.join(OUTPUT_FOLDER, f"{card_id}.png")
+
+    # Skip if already downloaded
+    if os.path.exists(output_path):
+        print(f"Already exists: {card_id}")
+        continue
+
+    try:
+        r = requests.get(image_url, stream=True)
+        if r.status_code == 200:
+            with open(output_path, "wb") as f:
+                for chunk in r.iter_content(1024):
+                    f.write(chunk)
+            print(f"Downloaded: {card_id}")
+        else:
+            print(f"Failed to download {card_id}: HTTP {r.status_code}")
+    except Exception as e:
+        print(f"Error downloading {card_id}: {e}")
+
+    sleep(REQUEST_DELAY)  # small delay to be polite
+
+print("All done!")
--- a/pythonScripts/fetchNames.py
+++ b/pythonScripts/fetchNames.py
@@ -0,0 +1,37 @@
+import os
+import requests
+from time import sleep
+
+# --- Configuration ---
+TCGDEX_API = "https://api.tcgdex.net/v2/de/cards"
+OUTPUT_FOLDER = "names"
+REQUEST_DELAY = 0.1  # seconds between requests to avoid rate limiting
+
+# Create output folder if not exists
+os.makedirs(OUTPUT_FOLDER, exist_ok=True)
+
+# Fetch card list from TCGdex
+print("Fetching card list...")
+resp = requests.get(TCGDEX_API)
+if resp.status_code != 200:
+    raise Exception(f"Failed to fetch card list: {resp.status_code}")
+cards = resp.json()
+print(f"Total cards fetched: {len(cards)}")
+
+names = set()  # using a set avoids duplicates automatically
+
+for card in cards:
+    card_name = card.get("name")
+    if not card_name:
+        print("Skipping card with missing name:", card)
+        continue
+    if "◇" in card_name:
+        continue
+    names.add(card_name)  # set ignores duplicates
+
+output_path = os.path.join(OUTPUT_FOLDER, "name.txt")
+with open(output_path, "w", encoding="utf-8") as f:
+    for name in names:
+        f.write("'" + name + "',\n")
+
+print(f"Wrote {len(names)} unique names to {output_path}")