restructure backend

This commit is contained in:
2026-01-16 18:07:23 +01:00
parent ebcc33b1f7
commit cd9d2e9900
21 changed files with 4323 additions and 69 deletions

View File

@@ -0,0 +1,61 @@
import os
import numpy as np
from PIL import Image
import torch
import open_clip
import faiss
# --- Configuration ---
# Folder that is scanned for card images, and the files this script writes.
CARDS_FOLDER = "cards_old"
EMBEDDINGS_FILE = "embeddings.npy"
IDS_FILE = "ids.npy"
FAISS_INDEX_FILE = "card_index.faiss"

# --- Device ---
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# --- Load CLIP model ---
# The second value returned by create_model_and_transforms is discarded.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-L-14",
    pretrained="laion2b_s32b_b82k",
)
# Move the model to the selected device and switch it to evaluation mode.
model = model.to(device)
model.eval()
# --- Helper: encode image ---
def encode_image(path):
    """Return the CLIP embedding of the image file at ``path``.

    The image is opened with PIL, converted to RGB, passed through the
    module-level ``preprocess`` transform and encoded by the module-level
    ``model`` on ``device`` with gradient tracking disabled.

    Args:
        path: Path of the image file to encode.

    Returns:
        The embedding as a NumPy array on the CPU. A leading batch
        dimension of size 1 is added before encoding (presumably the
        result has shape ``(1, dim)`` -- confirm against the model).
    """
    # Image.open keeps the file handle open until the image is closed.
    # convert() yields an independent in-memory image, so the handle can be
    # released right away instead of leaking one per encoded card.
    with Image.open(path) as img:
        rgb = img.convert("RGB")

    batch = preprocess(rgb).unsqueeze(0).to(device)
    with torch.no_grad():
        emb = model.encode_image(batch)
    return emb.cpu().numpy()
# --- Build embeddings ---
# `embeddings` and `ids` are filled in lockstep: row i of the stacked
# embedding matrix belongs to the file name ids[i].
embeddings = []
ids = []
# os.listdir() returns entries in arbitrary order; sorting makes the row
# order of the saved arrays and the index reproducible across runs.
for fname in sorted(os.listdir(CARDS_FOLDER)):
    if not fname.lower().endswith((".jpg", ".png")):
        continue
    path = os.path.join(CARDS_FOLDER, fname)
    emb = encode_image(path)
    embeddings.append(emb)
    ids.append(fname)
    print("Encoded:", fname)

# np.vstack() on an empty list fails with an unhelpful message; report the
# actual problem instead (still a ValueError, as before).
if not embeddings:
    raise ValueError(
        f"No .jpg/.png images found in {CARDS_FOLDER!r}; nothing to index."
    )
embeddings = np.vstack(embeddings)
# --- Save embeddings & IDs ---
# The embeddings are written to disk before normalization, so the saved
# file holds the values exactly as produced by the encoder.
np.save(EMBEDDINGS_FILE, embeddings)
ids_array = np.array(ids)
np.save(IDS_FILE, ids_array)
print("Saved embeddings and IDs.")

# --- Normalize embeddings ---
# normalize_L2 modifies `embeddings` in place; its return value is unused.
faiss.normalize_L2(embeddings)

# --- Build FAISS index ---
# With L2-normalized vectors the inner product equals cosine similarity.
d = embeddings.shape[1]  # embedding dimension
index = faiss.IndexFlatIP(d)
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} cards.")

# --- Save FAISS index ---
faiss.write_index(index, FAISS_INDEX_FILE)
print(f"FAISS index saved: {FAISS_INDEX_FILE}")

View File

@@ -0,0 +1,65 @@
import os
import re
import requests
from bs4 import BeautifulSoup
# Saved HTML page to scan and the folder the images are downloaded into.
HTML_FILE = "baseset.html"
DOWNLOAD_FOLDER = "baseset"
# Maps the set name found in a link title to the number used in file names.
SET_MAP = {"Basis-Set": 1}

os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

with open(HTML_FILE, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

# Title format: "Abra 43/102 - Basis-Set"
title_pattern = re.compile(r"(.+)\s+(\d+)\/(\d+)\s*-\s*(.+)")
# Resized variants end in e.g. "-427x600.jpg"; only originals are wanted.
resized_pattern = re.compile(r"-\d+x\d+\.jpg$")

# Collected (url, target file name) pairs.
entries = []
for anchor in soup.find_all("a"):
    href = anchor.get("href", "")
    caption = anchor.get("data-elementor-lightbox-title")

    # Only links that carry a lightbox title and point at a .jpg qualify.
    if not (caption and href.lower().endswith(".jpg")):
        continue
    if resized_pattern.search(href.lower()):
        continue

    parsed = title_pattern.match(caption.strip())
    if parsed is None:
        print(f"Skipping unmatched title format: {caption}")
        continue

    # Card name and set total are captured by the pattern but not used.
    _card_name, number_text, _set_total, set_label = parsed.groups()
    card_number = int(number_text)  # drops any leading zeros

    if set_label not in SET_MAP:
        print(f"Unknown set: {set_label}, please map it.")
        continue

    target_name = f"base{SET_MAP[set_label]}-{card_number}.jpg"
    entries.append((href, target_name))

print(f"Found {len(entries)} images to download.")
# Download every collected (url, filename) pair into DOWNLOAD_FOLDER.
# A failed request is reported and skipped so one bad URL does not abort
# the remaining downloads.
for url, filename in entries:
    filepath = os.path.join(DOWNLOAD_FOLDER, filename)
    print(f"Downloading {filename} from {url}")
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    # Catch only HTTP/network failures; a blanket `except Exception` would
    # also hide programming errors in this loop.
    except requests.RequestException as e:
        print(f" Failed: {e}")
        continue
    with open(filepath, "wb") as f:
        f.write(r.content)
print("Done!")

View File

@@ -0,0 +1,56 @@
import os
import requests
from time import sleep
# --- Configuration ---
TCGDEX_API = "https://api.tcgdex.net/v2/de/cards"
OUTPUT_FOLDER = "cards"
REQUEST_DELAY = 0.1  # seconds between requests to avoid rate limiting

# Create output folder if not exists
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Fetch card list from TCGdex
print("Fetching card list...")
# requests applies no timeout by default; without one a stalled connection
# would block this script indefinitely.
resp = requests.get(TCGDEX_API, timeout=30)
if resp.status_code != 200:
    raise RuntimeError(f"Failed to fetch card list: {resp.status_code}")
cards = resp.json()
print(f"Total cards fetched: {len(cards)}")
# Download each card image
# Every entry of `cards` is read as a mapping with optional "id" and
# "image" keys; entries missing either one are reported and skipped.
for card in cards:
    card_id = card.get("id")
    image_base = card.get("image")
    if not card_id:
        print("Skipping card with missing ID:", card)
        continue
    if not image_base:
        print(f"No image URL for {card_id}, skipping...")
        continue

    image_url = image_base + "/high.png"
    output_path = os.path.join(OUTPUT_FOLDER, f"{card_id}.png")

    # Skip if already downloaded
    if os.path.exists(output_path):
        print(f"Already exists: {card_id}")
        continue

    # Stream into a temporary ".part" file and rename it only once complete.
    # Writing straight to output_path would leave a truncated image behind
    # on an interrupted download, which the exists-check above would then
    # skip on every later run.
    partial_path = output_path + ".part"
    try:
        # The context manager releases the streamed connection on every
        # path, including non-200 responses; the timeout prevents a stalled
        # server from hanging the loop.
        with requests.get(image_url, stream=True, timeout=30) as r:
            if r.status_code == 200:
                with open(partial_path, "wb") as f:
                    for chunk in r.iter_content(64 * 1024):
                        f.write(chunk)
                os.replace(partial_path, output_path)
                print(f"Downloaded: {card_id}")
            else:
                print(f"Failed to download {card_id}: HTTP {r.status_code}")
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {card_id}: {e}")
        # Best-effort cleanup of the incomplete temporary file.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass
    sleep(REQUEST_DELAY)  # small delay to be polite
print("All done!")

View File

@@ -0,0 +1,37 @@
import os
import requests
from time import sleep
# --- Configuration ---
TCGDEX_API = "https://api.tcgdex.net/v2/de/cards"
OUTPUT_FOLDER = "names"
REQUEST_DELAY = 0.1  # seconds between requests to avoid rate limiting

# Create output folder if not exists
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Fetch card list from TCGdex
print("Fetching card list...")
# requests applies no timeout by default; without one a stalled connection
# would block this script indefinitely.
resp = requests.get(TCGDEX_API, timeout=30)
if resp.status_code != 200:
    raise RuntimeError(f"Failed to fetch card list: {resp.status_code}")
cards = resp.json()
print(f"Total cards fetched: {len(cards)}")
# NOTE(review): the previous filter was `if "" in card_name: continue`.
# The empty string is a substring of every string, so that test was always
# true, every card was skipped and the output file was always empty. The
# intended marker appears to have been lost (possibly a non-printable
# character) -- list the substrings that should exclude a name here.
EXCLUDED_MARKERS = ()

names = set()  # using a set avoids duplicates automatically
for card in cards:
    card_name = card.get("name")
    if not card_name:
        print("Skipping card with missing name:", card)
        continue
    # Empty markers are ignored so the always-true match cannot come back.
    if any(marker in card_name for marker in EXCLUDED_MARKERS if marker):
        continue
    names.add(card_name)  # set ignores duplicates

output_path = os.path.join(OUTPUT_FOLDER, "name.txt")
with open(output_path, "w", encoding="utf-8") as f:
    # Set iteration order is not reproducible; sort so repeated runs write
    # identical files. Each name is written as a single-quoted entry
    # followed by a comma, one per line.
    f.writelines("'" + name + "',\n" for name in sorted(names))
print(f"Wrote {len(names)} unique names to {output_path}")