849 lines
35 KiB
Python
849 lines
35 KiB
Python
"""
|
||
GameTora Umamusume Support Card Scraper
|
||
Scrapes all support cards with their effects at key levels (1, 25, 40, 50)
|
||
Includes character art download
|
||
"""
|
||
|
||
import sqlite3
|
||
import time
|
||
import re
|
||
import os
|
||
import requests
|
||
import sys
|
||
import os
|
||
|
||
# Ensure we can import from parent directory
|
||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||
|
||
from playwright.sync_api import sync_playwright
|
||
from db.db_queries import get_conn, init_database
|
||
|
||
BASE_URL = "https://gametora.com"
|
||
|
||
if getattr(sys, 'frozen', False):
|
||
# In frozen state, look in the same directory as the executable
|
||
IMAGES_PATH = os.path.join(os.path.dirname(sys.executable), "images")
|
||
else:
|
||
IMAGES_PATH = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "images")
|
||
|
||
# Key levels to scrape (limit break milestones)
|
||
KEY_LEVELS = [20, 25, 30, 35, 40, 45, 50]
|
||
|
||
# Rarity mapping based on image filename
|
||
RARITY_MAP = {
|
||
"rarity_01": "R",
|
||
"rarity_02": "SR",
|
||
"rarity_03": "SSR"
|
||
}
|
||
|
||
# Type mapping based on image filename - uses utx_ico_obtain_XX pattern!
|
||
TYPE_MAP = {
|
||
"obtain_00": "Speed",
|
||
"obtain_01": "Stamina",
|
||
"obtain_02": "Power",
|
||
"obtain_03": "Guts",
|
||
"obtain_04": "Wisdom",
|
||
"obtain_05": "Friend",
|
||
"obtain_06": "Group"
|
||
}
|
||
|
||
def scrape_all_support_links(page):
|
||
"""Scrape all support card links from the list page"""
|
||
print("Loading support card list...")
|
||
page.goto(f"{BASE_URL}/umamusume/supports", timeout=60000)
|
||
page.wait_for_load_state("networkidle")
|
||
|
||
# Tick "Show Upcoming Supports" checkbox if it exists
|
||
print("Checking for 'Show Upcoming Supports' checkbox...")
|
||
checkbox_ticked = page.evaluate("""
|
||
() => {
|
||
// Look for checkbox or toggle related to "upcoming" or "future" supports
|
||
const labels = Array.from(document.querySelectorAll('label, span, div'));
|
||
const checkboxLabel = labels.find(el =>
|
||
el.textContent.toLowerCase().includes('upcoming') &&
|
||
el.textContent.toLowerCase().includes('support')
|
||
);
|
||
|
||
if (checkboxLabel) {
|
||
// Try to find associated checkbox/input
|
||
let checkbox = checkboxLabel.querySelector('input[type="checkbox"]');
|
||
if (!checkbox) {
|
||
// Look for checkbox nearby
|
||
const parent = checkboxLabel.closest('div, label');
|
||
if (parent) {
|
||
checkbox = parent.querySelector('input[type="checkbox"]');
|
||
}
|
||
}
|
||
|
||
if (checkbox && !checkbox.checked) {
|
||
checkbox.click();
|
||
return true;
|
||
}
|
||
}
|
||
|
||
// Alternative: Look for any checkbox with "upcoming" in nearby text
|
||
const checkboxes = Array.from(document.querySelectorAll('input[type="checkbox"]'));
|
||
for (const cb of checkboxes) {
|
||
const text = cb.closest('label, div')?.textContent?.toLowerCase() || '';
|
||
if (text.includes('upcoming') && text.includes('support') && !cb.checked) {
|
||
cb.click();
|
||
return true;
|
||
}
|
||
}
|
||
|
||
return false;
|
||
}
|
||
""")
|
||
|
||
if checkbox_ticked:
|
||
print(" ✓ Ticked 'Show Upcoming Supports' checkbox")
|
||
page.wait_for_timeout(1000) # Wait for page to update
|
||
else:
|
||
print(" ℹ 'Show Upcoming Supports' checkbox not found or already ticked")
|
||
|
||
# Scroll to load all cards (lazy loading) - more scrolls for complete list
|
||
print("Scrolling to load all cards...")
|
||
for i in range(30):
|
||
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||
page.wait_for_timeout(400)
|
||
|
||
# Scroll back up and down to ensure all loaded
|
||
page.evaluate("window.scrollTo(0, 0)")
|
||
page.wait_for_timeout(500)
|
||
for i in range(10):
|
||
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||
page.wait_for_timeout(300)
|
||
|
||
# Extract all card links
|
||
links = page.evaluate("""
|
||
() => {
|
||
const links = Array.from(document.querySelectorAll('a[href*="/umamusume/supports/"]'))
|
||
.map(a => a.href)
|
||
.filter(href => href.match(/\\/supports\\/\\d+-/));
|
||
return [...new Set(links)];
|
||
}
|
||
""")
|
||
|
||
print(f"Found {len(links)} support cards")
|
||
return sorted(links)
|
||
|
||
def parse_rarity_from_image(img_src):
|
||
"""Extract rarity from image source URL"""
|
||
if not img_src:
|
||
return "R" # Default to R if not found
|
||
for key, rarity in RARITY_MAP.items():
|
||
if key in img_src:
|
||
return rarity
|
||
return "R" # Default to R
|
||
|
||
def parse_type_from_image(img_src):
|
||
"""Extract type from image source URL"""
|
||
if not img_src:
|
||
return "Unknown"
|
||
for key, card_type in TYPE_MAP.items():
|
||
if key in img_src:
|
||
return card_type
|
||
return "Unknown"
|
||
|
||
def get_max_level_for_rarity(rarity):
|
||
"""Get maximum level based on rarity"""
|
||
if rarity == "SSR":
|
||
return 50
|
||
elif rarity == "SR":
|
||
return 45
|
||
else: # R
|
||
return 40
|
||
|
||
def extract_stable_id_from_url(url):
|
||
"""Extract stable numeric ID from GameTora URL (e.g., 30022 from /supports/30022-mejiro-mcqueen)"""
|
||
match = re.search(r'/supports/(\d+)-', url)
|
||
if match:
|
||
return match.group(1)
|
||
return None
|
||
|
||
def download_card_image(page, stable_id, card_name):
|
||
"""Download the card's character art image"""
|
||
os.makedirs(IMAGES_PATH, exist_ok=True)
|
||
|
||
try:
|
||
# Find the main card image
|
||
img_url = page.evaluate("""
|
||
() => {
|
||
// Look for the main card image - usually a large character portrait
|
||
const imgs = Array.from(document.querySelectorAll('img'));
|
||
|
||
// Find images that might be the card art (usually larger images with character names)
|
||
const cardImg = imgs.find(img =>
|
||
img.src.includes('/supports/') ||
|
||
img.src.includes('/cards/') ||
|
||
(img.width > 100 && img.height > 100 && img.src.includes('umamusume'))
|
||
);
|
||
|
||
// Also look for images in the infobox
|
||
const infoboxImg = document.querySelector('[class*="infobox"] img');
|
||
|
||
return cardImg ? cardImg.src : (infoboxImg ? infoboxImg.src : null);
|
||
}
|
||
""")
|
||
|
||
if img_url:
|
||
# Clean filename - use stable ID from URL instead of card_id
|
||
safe_name = re.sub(r'[<>:"/\\|?*]', '_', card_name)
|
||
if stable_id:
|
||
file_path = os.path.join(IMAGES_PATH, f"{stable_id}_{safe_name}.png")
|
||
else:
|
||
# Fallback to name-only if no stable ID (shouldn't happen)
|
||
file_path = os.path.join(IMAGES_PATH, f"{safe_name}.png")
|
||
|
||
# Skip if already exists
|
||
if os.path.exists(file_path):
|
||
# print(f" Art already exists, skipping download")
|
||
return file_path
|
||
|
||
# Download image
|
||
response = requests.get(img_url, timeout=10)
|
||
if response.status_code == 200:
|
||
with open(file_path, 'wb') as f:
|
||
f.write(response.content)
|
||
return file_path
|
||
except Exception as e:
|
||
print(f" Warning: Could not download image: {e}")
|
||
|
||
return None
|
||
|
||
def scrape_support_card(page, url, conn, max_retries=3):
|
||
"""Scrape a single support card with key levels and retries"""
|
||
|
||
for attempt in range(max_retries):
|
||
try:
|
||
page.goto(url, timeout=60000)
|
||
page.wait_for_load_state("networkidle")
|
||
page.wait_for_timeout(2000) # Extra wait for JS rendering
|
||
|
||
# Extract basic card info including type
|
||
card_data = page.evaluate("""
|
||
() => {
|
||
const h1 = document.querySelector('h1');
|
||
const title = h1 ? h1.textContent.trim() : '';
|
||
|
||
// Find rarity image
|
||
const imgs = Array.from(document.querySelectorAll('img'));
|
||
const rarityImg = imgs.find(i => i.src.includes('rarity'));
|
||
|
||
// Find type image - uses utx_ico_obtain pattern
|
||
const typeImg = imgs.find(i => i.src.includes('obtain_0'));
|
||
|
||
return {
|
||
title: title,
|
||
rarityImgSrc: rarityImg ? rarityImg.src : null,
|
||
typeImgSrc: typeImg ? typeImg.src : null
|
||
};
|
||
}
|
||
""")
|
||
|
||
# Parse name from title (remove rarity and "Support Card")
|
||
full_title = card_data['title']
|
||
name = re.sub(r'\s*\(SSR\)|\s*\(SR\)|\s*\(R\)', '', full_title)
|
||
name = name.replace('Support Card', '').strip()
|
||
|
||
if not name:
|
||
print(f" Warning: Empty name, skipping")
|
||
return False
|
||
|
||
rarity = parse_rarity_from_image(card_data['rarityImgSrc'])
|
||
card_type = parse_type_from_image(card_data['typeImgSrc'])
|
||
max_level = get_max_level_for_rarity(rarity)
|
||
|
||
print(f"Scraping: {name} | {rarity} | {card_type} | Max Level: {max_level}")
|
||
|
||
cur = conn.cursor()
|
||
|
||
# Insert card using OR IGNORE to keep the same card_id if it exists
|
||
cur.execute("""
|
||
INSERT OR IGNORE INTO support_cards (name, rarity, card_type, max_level, gametora_url)
|
||
VALUES (?, ?, ?, ?, ?)
|
||
""", (name, rarity, card_type, max_level, url))
|
||
|
||
# Update existing card to ensure data is fresh (without changing ID)
|
||
cur.execute("""
|
||
UPDATE support_cards
|
||
SET name = ?, rarity = ?, card_type = ?, max_level = ?
|
||
WHERE gametora_url = ?
|
||
""", (name, rarity, card_type, max_level, url))
|
||
conn.commit()
|
||
|
||
cur.execute("SELECT card_id FROM support_cards WHERE gametora_url = ?", (url,))
|
||
card_id = cur.fetchone()[0]
|
||
|
||
# Extract stable ID from URL for image filename
|
||
stable_id = extract_stable_id_from_url(url)
|
||
|
||
# Download character art using stable ID
|
||
image_path = download_card_image(page, stable_id, name)
|
||
if image_path:
|
||
cur.execute("UPDATE support_cards SET image_path = ? WHERE card_id = ?", (image_path, card_id))
|
||
conn.commit()
|
||
|
||
# Clear existing effects for this card (in case of re-scrape)
|
||
cur.execute("DELETE FROM support_effects WHERE card_id = ?", (card_id,))
|
||
cur.execute("DELETE FROM support_hints WHERE card_id = ?", (card_id,))
|
||
cur.execute("DELETE FROM event_skills WHERE event_id IN (SELECT event_id FROM support_events WHERE card_id = ?)", (card_id,))
|
||
cur.execute("DELETE FROM support_events WHERE card_id = ?", (card_id,))
|
||
|
||
# Scrape effects at key levels only
|
||
scrape_effects_key_levels(page, card_id, max_level, cur)
|
||
|
||
# Scrape hints
|
||
scrape_hints(page, card_id, cur)
|
||
|
||
# Scrape events
|
||
scrape_events(page, card_id, cur)
|
||
|
||
conn.commit()
|
||
return True
|
||
|
||
except Exception as e:
|
||
print(f" Attempt {attempt + 1} failed: {e}")
|
||
if attempt < max_retries - 1:
|
||
print(f" Retrying...")
|
||
time.sleep(1)
|
||
else:
|
||
print(f" All retries failed for {url}")
|
||
return False
|
||
|
||
def set_level(page, target_level):
|
||
"""Set the page to a specific level using JavaScript to click buttons"""
|
||
|
||
# Use JavaScript to handle all the clicking - need to check children.length === 0
|
||
# and wait for level text to change after each click
|
||
actual_level = page.evaluate("""
|
||
async (targetLevel) => {
|
||
// Get current level from the page
|
||
const getLevel = () => {
|
||
const el = Array.from(document.querySelectorAll('div')).find(d =>
|
||
d.textContent.trim().startsWith('Level ') && d.children.length === 0
|
||
);
|
||
if (!el) {
|
||
const text = document.body.innerText;
|
||
const match = text.match(/Level\\s*(\\d+)/i);
|
||
return match ? parseInt(match[1]) : 30;
|
||
}
|
||
return parseInt(el.textContent.replace('Level ', '').trim());
|
||
};
|
||
|
||
// Find a button by its exact text - MUST check children.length === 0
|
||
const clickButton = (text) => {
|
||
const btns = Array.from(document.querySelectorAll('div'));
|
||
const btn = btns.find(d => d.textContent.trim() === text && d.children.length === 0);
|
||
if (btn) {
|
||
btn.click();
|
||
return true;
|
||
}
|
||
return false;
|
||
};
|
||
|
||
let currentLevel = getLevel();
|
||
|
||
// Navigate to target level
|
||
while (currentLevel !== targetLevel) {
|
||
let btnText;
|
||
if (currentLevel > targetLevel) {
|
||
const diff = currentLevel - targetLevel;
|
||
btnText = diff >= 5 ? '-5' : '-1';
|
||
} else {
|
||
const diff = targetLevel - currentLevel;
|
||
btnText = diff >= 5 ? '+5' : '+1';
|
||
}
|
||
|
||
if (!clickButton(btnText)) {
|
||
// Button not found, we might be at max/min level
|
||
break;
|
||
}
|
||
|
||
// CRITICAL: Wait for level text to actually change
|
||
const startLevel = currentLevel;
|
||
let start = Date.now();
|
||
while (Date.now() - start < 1000) {
|
||
await new Promise(r => setTimeout(r, 50));
|
||
const newLevel = getLevel();
|
||
if (newLevel !== startLevel) {
|
||
currentLevel = newLevel;
|
||
break;
|
||
}
|
||
}
|
||
|
||
// If we're stuck, break out
|
||
if (currentLevel === startLevel) {
|
||
break;
|
||
}
|
||
}
|
||
|
||
// Final wait for effects to update
|
||
await new Promise(r => setTimeout(r, 200));
|
||
return getLevel();
|
||
}
|
||
""", target_level)
|
||
|
||
return actual_level
|
||
|
||
def extract_effects(page):
|
||
"""Extract effects from current page state using proper DOM selectors"""
|
||
effects = page.evaluate("""
|
||
() => {
|
||
const effects = [];
|
||
|
||
// Method 1: Try to find effects using the specific class structure
|
||
const effectContainers = document.querySelectorAll('[class*="effect__"]');
|
||
|
||
effectContainers.forEach(container => {
|
||
const text = container.innerText.trim();
|
||
if (text.includes('Unlocked at level')) return;
|
||
|
||
const fullText = text.split('\\n').join(' ');
|
||
|
||
const patterns = [
|
||
|
||
// Basic Stats
|
||
{ regex: /Speed Bonus\\s*(\\d+)/, name: 'Speed Bonus' },
|
||
{ regex: /Stamina Bonus\\s*(\\d+)/, name: 'Stamina Bonus' },
|
||
{ regex: /Power Bonus\\s*(\\d+)/, name: 'Power Bonus' },
|
||
{ regex: /Guts Bonus\\s*(\\d+)/, name: 'Guts Bonus' },
|
||
{ regex: /Wisdom Bonus\\s*(\\d+)/, name: 'Wisdom Bonus' },
|
||
{ regex: /Wit Bonus\\s*(\\d+)/, name: 'Wisdom Bonus' }, // Alias Wit -> Wisdom
|
||
{ regex: /Skill Pts Bonus\\s*(\\d+)/, name: 'Skill Pts Bonus' },
|
||
|
||
// Initial Stats
|
||
{ regex: /Initial Speed\\s*(\\d+)/, name: 'Initial Speed' },
|
||
{ regex: /Initial Stamina\\s*(\\d+)/, name: 'Initial Stamina' },
|
||
{ regex: /Initial Power\\s*(\\d+)/, name: 'Initial Power' },
|
||
{ regex: /Initial Guts\\s*(\\d+)/, name: 'Initial Guts' },
|
||
{ regex: /Initial Wisdom\\s*(\\d+)/, name: 'Initial Wisdom' },
|
||
{ regex: /Initial Wit\\s*(\\d+)/, name: 'Initial Wisdom' }, // Alias Wit -> Wisdom
|
||
|
||
// Special Bonuses
|
||
{ regex: /Friendship Bonus\\s*(\\d+%?)/, name: 'Friendship Bonus' },
|
||
{ regex: /Mood Effect\\s*(\\d+%?)/, name: 'Mood Effect' },
|
||
{ regex: /Motivation Effect\\s*(\\d+%?)/, name: 'Motivation Effect' },
|
||
{ regex: /Training Effectiveness\\s*(\\d+%?)/, name: 'Training Effectiveness' },
|
||
{ regex: /Race Bonus\\s*(\\d+%?)/, name: 'Race Bonus' },
|
||
{ regex: /Fan Bonus\\s*(\\d+%?)/, name: 'Fan Bonus' },
|
||
|
||
// Hints
|
||
{ regex: /Hint Rate\\s*(\\d+%?)/, name: 'Hint Rate' },
|
||
{ regex: /Hint Frequency\\s*(\\d+%?)/, name: 'Hint Rate' }, // Alias Frequency -> Rate
|
||
{ regex: /Hint Lv Up\\s*(\\d+%?)/, name: 'Hint Lv Up' },
|
||
{ regex: /Hint Levels\\s*Lv\\s*(\\d+)/, name: 'Hint Lv Up' }, // Alias Hint Levels -> Hint Lv Up
|
||
|
||
// Specialty/Bond
|
||
{ regex: /Starting Bond\\s*(\\d+)/, name: 'Starting Bond' },
|
||
{ regex: /Initial Friendship Gauge\\s*(\\d+)/, name: 'Starting Bond' }, // Alias -> Starting Bond
|
||
{ regex: /Specialty Rate\\s*(\\d+%?)/, name: 'Specialty Rate' },
|
||
{ regex: /Specialty Priority\\s*(\\d+)/, name: 'Specialty Rate' }, // Alias Priority -> Rate (usually same concept)
|
||
|
||
// Recovery/Usage
|
||
{ regex: /Race Status\\s*(\\d+)/, name: 'Race Status' },
|
||
{ regex: /Energy Discount\\s*(\\d+%?)/, name: 'Energy Discount' },
|
||
{ regex: /Wit Friendship Recovery\\s*(\\d+)/, name: 'Wisdom Friendship Recovery' },
|
||
|
||
// Catch-all Unique
|
||
{ regex: /Unique Effect\\s*(.*)/, name: 'Unique Effect' },
|
||
];
|
||
|
||
|
||
for (const p of patterns) {
|
||
const match = fullText.match(p.regex);
|
||
if (match && !effects.some(e => e.name === p.name)) {
|
||
effects.push({ name: p.name, value: match[1] });
|
||
}
|
||
}
|
||
});
|
||
|
||
// Method 2: Fallback - scan entire page text
|
||
if (effects.length === 0) {
|
||
const bodyText = document.body.innerText;
|
||
const lines = bodyText.split('\\n');
|
||
|
||
const simplePatterns = [
|
||
/^(Friendship Bonus)\\s*(\\d+%?)$/,
|
||
/^(Mood Effect)\\s*(\\d+%?)$/,
|
||
/^(Race Bonus)\\s*(\\d+%?)$/,
|
||
/^(Fan Bonus)\\s*(\\d+%?)$/,
|
||
/^(Training Effectiveness)\\s*(\\d+%?)$/,
|
||
];
|
||
|
||
for (const line of lines) {
|
||
const trimmed = line.trim();
|
||
for (const pattern of simplePatterns) {
|
||
const match = trimmed.match(pattern);
|
||
if (match && !effects.some(e => e.name === match[1])) {
|
||
effects.push({ name: match[1], value: match[2] });
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
return effects;
|
||
}
|
||
""")
|
||
return effects
|
||
|
||
def scrape_effects_key_levels(page, card_id, max_level, cur):
|
||
"""Scrape effects at key levels based on card rarity/max level"""
|
||
|
||
# Determine which levels to scrape based on max_level (rarity)
|
||
if max_level == 50: # SSR
|
||
levels_to_scrape = [30, 35, 40, 45, 50]
|
||
elif max_level == 45: # SR
|
||
levels_to_scrape = [25, 30, 35, 40, 45]
|
||
else: # R (max 40)
|
||
levels_to_scrape = [20, 25, 30, 35, 40]
|
||
|
||
# Filter out any that might exceed max somehow (safety check)
|
||
levels_to_scrape = [l for l in levels_to_scrape if l <= max_level]
|
||
|
||
for level in levels_to_scrape:
|
||
actual_level = set_level(page, level)
|
||
effects = extract_effects(page)
|
||
|
||
for effect in effects:
|
||
cur.execute("""
|
||
INSERT INTO support_effects (card_id, level, effect_name, effect_value)
|
||
VALUES (?, ?, ?, ?)
|
||
""", (card_id, actual_level, effect['name'], effect['value']))
|
||
|
||
print(f" Level {actual_level}: {len(effects)} effects")
|
||
|
||
def scrape_hints(page, card_id, cur):
|
||
"""Scrape support hints/training skills"""
|
||
hints = page.evaluate("""
|
||
() => {
|
||
const hints = [];
|
||
const text = document.body.innerText;
|
||
|
||
const hintsMatch = text.match(/Support [Hh]ints([\\s\\S]*?)(?:Training [Ee]vents|Skills from [Ee]vents|$)/);
|
||
if (!hintsMatch) return hints;
|
||
|
||
const hintsSection = hintsMatch[1];
|
||
const lines = hintsSection.split('\\n');
|
||
|
||
for (const line of lines) {
|
||
const trimmed = line.trim();
|
||
if (trimmed.length > 3 && trimmed.length < 60 &&
|
||
!trimmed.includes('Lv') && !trimmed.includes('%') &&
|
||
!trimmed.includes('Details') && trimmed[0] === trimmed[0].toUpperCase()) {
|
||
hints.push({ name: trimmed, description: '' });
|
||
}
|
||
}
|
||
|
||
return hints.slice(0, 10);
|
||
}
|
||
""")
|
||
|
||
for hint in hints:
|
||
cur.execute("""
|
||
INSERT INTO support_hints (card_id, hint_name, hint_description)
|
||
VALUES (?, ?, ?)
|
||
""", (card_id, hint.get('name', ''), hint.get('description', '')))
|
||
|
||
if hints:
|
||
print(f" Found {len(hints)} hints")
|
||
|
||
def scrape_events(page, card_id, cur):
|
||
"""Scrape the LAST chain event (Golden Perk) with OR options"""
|
||
|
||
# Use a flag to avoid adding multiple console listeners
|
||
if not hasattr(page, "_console_attached"):
|
||
page.on("console", lambda msg: print(f" [JS Console] {msg.text}") if "scrapping" not in msg.text.lower() else None)
|
||
page._console_attached = True
|
||
|
||
# 1. First, build a map of skills from the 'Skills from events' summary section
|
||
# This helps us identify which skills are Rare (Gold)
|
||
skill_rarity_map = page.evaluate("""
|
||
() => {
|
||
const map = {};
|
||
console.log("Building Skill Rarity Map...");
|
||
|
||
// 1. Find all skill containers. They usually have a name and a 'Details' button.
|
||
// In the "Skills from events" or "Support hints" sections.
|
||
const containers = Array.from(document.querySelectorAll('div')).filter(d =>
|
||
(d.innerText.includes('Details') || d.innerText.includes('Reward')) && d.innerText.length < 500
|
||
);
|
||
|
||
containers.forEach(c => {
|
||
// Try to extract the skill name. It's usually the first text node or a bold tag.
|
||
const nameNode = c.querySelector('b, span[font-weight="bold"], div[font-weight="bold"]');
|
||
let name = "";
|
||
if (nameNode) {
|
||
name = nameNode.innerText.trim();
|
||
} else {
|
||
// Fallback to text before 'Details'
|
||
name = c.innerText.split('Details')[0].replace(/\\n/g, ' ').trim();
|
||
}
|
||
|
||
if (name && name.length > 2) {
|
||
const style = window.getComputedStyle(c);
|
||
const nameStyle = nameNode ? window.getComputedStyle(nameNode) : null;
|
||
|
||
// Golden skills have a specific background
|
||
const isGold = style.backgroundImage.includes('linear-gradient') ||
|
||
style.backgroundColor.includes('rgb(255, 193, 7)') ||
|
||
(nameStyle && nameStyle.color === 'rgb(255, 193, 7)') ||
|
||
c.className.includes('kkspcu') ||
|
||
c.innerHTML.includes('kkspcu');
|
||
|
||
const normalized = name.toLowerCase().replace(/\\s+/g, ' ').replace(/[()()-]/g, '').trim();
|
||
map[normalized] = isGold;
|
||
console.log(`Mapped Skill: "${name}" [${normalized}] -> Gold: ${isGold}`);
|
||
}
|
||
});
|
||
return map;
|
||
}
|
||
""")
|
||
|
||
# Scroll to the Events section specifically
|
||
print(" Ensuring events are loaded...")
|
||
page.evaluate("() => { const h = Array.from(document.querySelectorAll('h2, h1, div')).find(el => el.innerText.toLowerCase().includes('training events')); if (h) h.scrollIntoView(); }")
|
||
page.wait_for_timeout(1000)
|
||
|
||
# 2. Scrape ONLY the LAST chain event (Golden Perk) with OR options
|
||
golden_perk_data = page.evaluate("""
|
||
async () => {
|
||
console.log("Scraping Golden Perk (last chain event)...");
|
||
|
||
// Find all chain event buttons
|
||
const getChainEventButtons = () => {
|
||
const buttons = [];
|
||
// Look for "Chain Events" text (case-insensitive substring)
|
||
const labels = Array.from(document.querySelectorAll('div, span, h2, h3, h4')).filter(el =>
|
||
el.innerText.toLowerCase().includes('chain events') && el.innerText.trim().length < 20
|
||
);
|
||
|
||
labels.forEach(label => {
|
||
// The buttons are usually in the same container or next container
|
||
let container = label.parentElement;
|
||
let attempts = 0;
|
||
while (container && container.querySelectorAll('button').length === 0 && attempts < 5) {
|
||
container = container.nextElementSibling || container.parentElement;
|
||
attempts++;
|
||
if (container && container.tagName === 'BODY') break;
|
||
}
|
||
|
||
if (container) {
|
||
const btns = Array.from(container.querySelectorAll('button'));
|
||
btns.forEach(btn => {
|
||
const text = btn.innerText.trim();
|
||
const style = window.getComputedStyle(btn);
|
||
const isVisible = style.display !== 'none' && style.visibility !== 'hidden';
|
||
|
||
// Look for arrows (regular or heavy)
|
||
if (isVisible && (text.includes('>') || text.includes('❯'))) {
|
||
buttons.push(btn);
|
||
}
|
||
});
|
||
}
|
||
});
|
||
return buttons;
|
||
};
|
||
|
||
const buttons = getChainEventButtons();
|
||
console.log(`Found ${buttons.length} chain event buttons`);
|
||
|
||
if (buttons.length === 0) {
|
||
return null;
|
||
}
|
||
|
||
let goldenPerkButton = null;
|
||
let maxArrows = 0;
|
||
|
||
for (const btn of buttons) {
|
||
const text = btn.innerText.trim();
|
||
// Count both regular and heavy arrows
|
||
const arrowCount = (text.match(/>|❯/g) || []).length;
|
||
|
||
// If it has three heavy arrows, it's almost certainly the golden perk
|
||
if (text.includes('❯❯❯')) {
|
||
goldenPerkButton = btn;
|
||
break;
|
||
}
|
||
|
||
if (arrowCount > maxArrows) {
|
||
maxArrows = arrowCount;
|
||
goldenPerkButton = btn;
|
||
}
|
||
}
|
||
|
||
if (!goldenPerkButton) {
|
||
console.log("No golden perk button found");
|
||
return null;
|
||
}
|
||
|
||
const eventName = goldenPerkButton.innerText.trim();
|
||
console.log(`Found Golden Perk: ${eventName} (${maxArrows} arrows)`);
|
||
|
||
try {
|
||
// Click to open popover
|
||
goldenPerkButton.scrollIntoViewIfNeeded ? goldenPerkButton.scrollIntoViewIfNeeded() : null;
|
||
await new Promise(r => setTimeout(r, 100));
|
||
goldenPerkButton.click();
|
||
await new Promise(r => setTimeout(r, 600));
|
||
|
||
// Find popover
|
||
const popovers = Array.from(document.querySelectorAll('div')).filter(d =>
|
||
d.innerText.includes(eventName) &&
|
||
window.getComputedStyle(d).zIndex > 50 &&
|
||
d.innerText.length < 2500
|
||
);
|
||
|
||
if (popovers.length === 0) {
|
||
console.log(`Popover NOT found for ${eventName}`);
|
||
document.body.click();
|
||
return { name: eventName, type: 'Chain', skills: [] };
|
||
}
|
||
|
||
const pop = popovers[popovers.length - 1];
|
||
console.log(`Found popover for ${eventName}`);
|
||
|
||
// Check for OR structure - look for "Randomly either" or "or" divider
|
||
const hasOrDivider = pop.querySelector('[class*="divider_or"]') !== null ||
|
||
pop.innerText.includes('Randomly either') ||
|
||
pop.innerText.toLowerCase().includes(' or ');
|
||
|
||
// Find all skill names (purple/blue links)
|
||
const skillLinks = Array.from(pop.querySelectorAll('span, a')).filter(el =>
|
||
el.innerText.length > 2 &&
|
||
!el.innerText.includes('Energy') &&
|
||
!el.innerText.includes('bond') &&
|
||
(window.getComputedStyle(el).color === 'rgb(102, 107, 255)' ||
|
||
el.className.includes('linkcolor'))
|
||
);
|
||
|
||
console.log(`Found ${skillLinks.length} potential skills in popover`);
|
||
|
||
const skills = [];
|
||
skillLinks.forEach(link => {
|
||
const skillName = link.innerText.trim();
|
||
if (skillName && skillName.length > 2 && !skills.some(s => s.name === skillName)) {
|
||
// If there's an OR divider, all skills in this popover are part of OR groups
|
||
const isOr = hasOrDivider;
|
||
skills.push({ name: skillName, is_or: isOr });
|
||
}
|
||
});
|
||
|
||
// Close popover
|
||
document.body.click();
|
||
await new Promise(r => setTimeout(r, 200));
|
||
|
||
return { name: eventName, type: 'Chain', skills: skills };
|
||
|
||
} catch (err) {
|
||
console.log(`Error clicking ${eventName}: ${err.message}`);
|
||
return { name: eventName, type: 'Chain', skills: [] };
|
||
}
|
||
}
|
||
""")
|
||
|
||
# 3. Store ONLY the golden perk in database
|
||
if golden_perk_data:
|
||
cur.execute("""
|
||
INSERT INTO support_events (card_id, event_name, event_type)
|
||
VALUES (?, ?, ?)
|
||
""", (card_id, golden_perk_data['name'], golden_perk_data['type']))
|
||
event_id = cur.lastrowid
|
||
|
||
for skill in golden_perk_data['skills']:
|
||
# Normalization helper
|
||
def normalize(s):
|
||
return s.lower().replace(" hint +1", "").replace(" hint +3", "").replace(" hint +5", "").replace(" hint +", "").strip().replace(" ", " ").replace("-", "").replace("(", "").replace(")", "").replace(" ", "")
|
||
|
||
skill_name = normalize(skill['name'])
|
||
|
||
# Use extra aggressive name matching against the map values
|
||
# (The map keys are already normalized)
|
||
is_gold = 0
|
||
for k, gold in skill_rarity_map.items():
|
||
if normalize(k) == skill_name:
|
||
is_gold = 1 if gold else 0
|
||
break
|
||
|
||
# Fallback 1: If it's a chain event and specifically the last one, it's almost certainly gold
|
||
if not is_gold and golden_perk_data.get('type') == 'Chain':
|
||
# Check for "hint" patterns which usually accompany gold perks in chain events
|
||
if "hint +" in skill['name'].lower() or len(golden_perk_data['skills']) <= 2:
|
||
is_gold = 1
|
||
print(f" ✨ Golden Skill Fallback (Last Chain Event): {skill['name']}")
|
||
|
||
if is_gold:
|
||
print(f" ✨ Golden Skill Verified: {skill['name']}")
|
||
|
||
cur.execute("""
|
||
INSERT INTO event_skills (event_id, skill_name, is_gold, is_or)
|
||
VALUES (?, ?, ?, ?)
|
||
""", (event_id, skill['name'], is_gold, 1 if skill['is_or'] else 0))
|
||
|
||
skill_count = len(golden_perk_data['skills'])
|
||
or_count = sum(1 for s in golden_perk_data['skills'] if s['is_or'])
|
||
print(f" Golden Perk: {golden_perk_data['name']} ({skill_count} skills, {or_count} with OR)")
|
||
else:
|
||
print(f" No Golden Perk found for this card")
|
||
|
||
def run_scraper():
|
||
""" Run the web scraper to fetch card data from GameTora.com """
|
||
print("=" * 60)
|
||
print("GameTora Umamusume Support Card Scraper")
|
||
print(f"Scraping effects at levels: {KEY_LEVELS}")
|
||
print("=" * 60)
|
||
|
||
# Initialize fresh database
|
||
print("\nInitializing database...")
|
||
init_database()
|
||
|
||
# Create images directory
|
||
os.makedirs(IMAGES_PATH, exist_ok=True)
|
||
|
||
conn = get_conn()
|
||
|
||
with sync_playwright() as p:
|
||
print("\nLaunching browser...")
|
||
browser = p.chromium.launch(headless=True)
|
||
context = browser.new_context()
|
||
page = context.new_page()
|
||
|
||
# Get all card links
|
||
links = scrape_all_support_links(page)
|
||
|
||
print(f"\nStarting to scrape {len(links)} cards...")
|
||
print("Including character art download.")
|
||
print("This will take approximately 90-120 minutes.\n")
|
||
|
||
success_count = 0
|
||
fail_count = 0
|
||
|
||
for i, url in enumerate(links, 1):
|
||
print(f"\n[{i}/{len(links)}] ", end="")
|
||
if scrape_support_card(page, url, conn):
|
||
success_count += 1
|
||
else:
|
||
fail_count += 1
|
||
|
||
time.sleep(0.3)
|
||
|
||
if i % 50 == 0:
|
||
print(f"\n{'='*40}")
|
||
print(f"Progress: {i}/{len(links)} ({i*100//len(links)}%)")
|
||
print(f"Success: {success_count}, Failed: {fail_count}")
|
||
print(f"{'='*40}")
|
||
|
||
browser.close()
|
||
|
||
conn.close()
|
||
|
||
print("\n" + "=" * 60)
|
||
print("Scraping Complete!")
|
||
print(f"Successfully scraped: {success_count} cards")
|
||
print(f"Failed: {fail_count} cards")
|
||
print(f"Images saved to: {IMAGES_PATH}")
|
||
print("=" * 60)
|
||
|
||
if __name__ == "__main__":
|
||
run_scraper()
|