restructure backend
This commit is contained in:
61
pythonScripts/build_index.py
Normal file
61
pythonScripts/build_index.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import os

import numpy as np
import torch
from PIL import Image

import open_clip
import faiss

# --- Configuration ---
CARDS_FOLDER = "cards_old"             # folder of card images to index
EMBEDDINGS_FILE = "embeddings.npy"     # raw (unnormalized) CLIP embeddings
IDS_FILE = "ids.npy"                   # filenames, aligned row-for-row with embeddings
FAISS_INDEX_FILE = "card_index.faiss"  # serialized FAISS index

# --- Device ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# --- Load CLIP model ---
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-L-14', pretrained='laion2b_s32b_b82k'
)
model = model.to(device).eval()


def encode_image(path):
    """Return the CLIP image embedding for *path* as a (1, d) numpy array."""
    img = Image.open(path).convert("RGB")
    with torch.no_grad():
        emb = model.encode_image(preprocess(img).unsqueeze(0).to(device))
    return emb.cpu().numpy()


# --- Build embeddings ---
embeddings = []
ids = []

# sorted() makes the row order (and therefore the saved IDs and index)
# deterministic; os.listdir order is filesystem-dependent.
for fname in sorted(os.listdir(CARDS_FOLDER)):
    if fname.lower().endswith((".jpg", ".jpeg", ".png")):
        path = os.path.join(CARDS_FOLDER, fname)
        emb = encode_image(path)
        embeddings.append(emb)
        ids.append(fname)
        print("Encoded:", fname)

# Fail early with a clear message instead of np.vstack's cryptic
# "need at least one array to concatenate" ValueError.
if not embeddings:
    raise SystemExit(f"No images found in {CARDS_FOLDER!r}; nothing to index.")

embeddings = np.vstack(embeddings)

# --- Save embeddings & IDs ---
# Saved BEFORE normalization, so the .npy file keeps the raw embeddings.
np.save(EMBEDDINGS_FILE, embeddings)
np.save(IDS_FILE, np.array(ids))
print("Saved embeddings and IDs.")

# --- Normalize embeddings ---
# L2-normalize in place so that inner product == cosine similarity.
faiss.normalize_L2(embeddings)

# --- Build FAISS index ---
d = embeddings.shape[1]       # embedding dimension
index = faiss.IndexFlatIP(d)  # inner product on unit vectors = cosine similarity
index.add(embeddings)
print("FAISS index built with", index.ntotal, "cards.")

# --- Save FAISS index ---
faiss.write_index(index, FAISS_INDEX_FILE)
print("FAISS index saved:", FAISS_INDEX_FILE)
|
||||
65
pythonScripts/download_baseset.py
Normal file
65
pythonScripts/download_baseset.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import os
import re

import requests
from bs4 import BeautifulSoup

HTML_FILE = "baseset.html"   # saved gallery page to scrape
DOWNLOAD_FOLDER = "baseset"  # where renamed card scans are written

# Maps the set name found in the lightbox title to the numeric prefix
# used in the output filename (base<set>-<card>.jpg).
SET_MAP = {
    "Basis-Set": 1,
}

os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

with open(HTML_FILE, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

entries = []  # (source url, target filename) pairs

for a in soup.find_all("a"):
    url = a.get("href", "")
    title = a.get("data-elementor-lightbox-title")

    if not title or not url.lower().endswith(".jpg"):
        continue

    # We only want the original (no -427x600 etc)
    if re.search(r"-\d+x\d+\.jpg$", url.lower()):
        continue

    # Parse title: "Abra 43/102 - Basis-Set"
    m = re.match(r"(.+)\s+(\d+)\/(\d+)\s*-\s*(.+)", title.strip())
    if not m:
        print(f"Skipping unmatched title format: {title}")
        continue

    name, card, total, set_name = m.groups()
    card = int(card)

    if set_name not in SET_MAP:
        print(f"Unknown set: {set_name}, please map it.")
        continue

    set_num = SET_MAP[set_name]
    new_filename = f"base{set_num}-{card}.jpg"

    entries.append((url, new_filename))

print(f"Found {len(entries)} images to download.")

for url, filename in entries:
    filepath = os.path.join(DOWNLOAD_FOLDER, filename)
    # BUG FIX: previously printed the literal "(unknown)" instead of the
    # computed target filename.
    print(f"Downloading {filename} from {url}")

    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        print(f" Failed: {e}")
        continue

    with open(filepath, "wb") as f:
        f.write(r.content)

print("Done!")
|
||||
56
pythonScripts/download_cards.py
Normal file
56
pythonScripts/download_cards.py
Normal file
@@ -0,0 +1,56 @@
|
||||
import os
from time import sleep

import requests

# --- Configuration ---
TCGDEX_API = "https://api.tcgdex.net/v2/de/cards"
OUTPUT_FOLDER = "cards"
REQUEST_DELAY = 0.1    # seconds between requests to avoid rate limiting
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled server hangs forever

# Create output folder if not exists
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Fetch card list from TCGdex
print("Fetching card list...")
resp = requests.get(TCGDEX_API, timeout=REQUEST_TIMEOUT)
if resp.status_code != 200:
    raise Exception(f"Failed to fetch card list: {resp.status_code}")
cards = resp.json()
print(f"Total cards fetched: {len(cards)}")

# Download each card image
for card in cards:
    card_id = card.get("id")
    image_base = card.get("image")

    if not card_id:
        print("Skipping card with missing ID:", card)
        continue

    if not image_base:
        print(f"No image URL for {card_id}, skipping...")
        continue

    # TCGdex serves renditions under the base URL; /high.png is the largest.
    image_url = image_base + "/high.png"
    output_path = os.path.join(OUTPUT_FOLDER, f"{card_id}.png")

    # Skip if already downloaded
    if os.path.exists(output_path):
        print(f"Already exists: {card_id}")
        continue

    try:
        r = requests.get(image_url, stream=True, timeout=REQUEST_TIMEOUT)
        if r.status_code == 200:
            # Stream to disk in 1 KiB chunks to avoid holding the image in memory.
            with open(output_path, "wb") as f:
                for chunk in r.iter_content(1024):
                    f.write(chunk)
            print(f"Downloaded: {card_id}")
        else:
            print(f"Failed to download {card_id}: HTTP {r.status_code}")
    except Exception as e:
        print(f"Error downloading {card_id}: {e}")

    sleep(REQUEST_DELAY)  # small delay to be polite

print("All done!")
|
||||
37
pythonScripts/fetchNames.py
Normal file
37
pythonScripts/fetchNames.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import os
import requests
from time import sleep

# --- Configuration ---
TCGDEX_API = "https://api.tcgdex.net/v2/de/cards"
OUTPUT_FOLDER = "names"
REQUEST_DELAY = 0.1  # seconds between requests to avoid rate limiting

# Make sure the destination folder exists.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Pull the complete card list from the TCGdex API.
print("Fetching card list...")
resp = requests.get(TCGDEX_API)
if resp.status_code != 200:
    raise Exception(f"Failed to fetch card list: {resp.status_code}")
cards = resp.json()
print(f"Total cards fetched: {len(cards)}")

# Collect names in a set so duplicates collapse automatically.
names = set()

for entry in cards:
    title = entry.get("name")
    if not title:
        print("Skipping card with missing name:", entry)
        continue
    if "◇" in title:
        continue
    names.add(title)

# Write one quoted, comma-terminated name per line.
output_path = os.path.join(OUTPUT_FOLDER, "name.txt")
with open(output_path, "w", encoding="utf-8") as out:
    out.writelines(f"'{n}',\n" for n in names)

print(f"Wrote {len(names)} unique names to {output_path}")
|
||||
Reference in New Issue
Block a user