From 9700d0bd9a90839083f939ec83bb00d614808c6c Mon Sep 17 00:00:00 2001 From: josh Date: Sat, 4 Apr 2026 13:49:49 -0400 Subject: [PATCH] feat: RCDB-backed roller coaster filter with fuzzy name matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add lib/park-meta.ts to manage data/park-meta.json (rcdb_id + coaster lists) - Add lib/scrapers/rcdb.ts to scrape operating coaster names from RCDB park pages - discover.ts now seeds park-meta.json with skeleton entries for all parks - scrape.ts now refreshes RCDB coaster lists (30-day staleness) for parks with rcdb_id set - fetchLiveRides() accepts a coasterNames Set; isCoaster uses normalize() on both sides to handle trademark symbols, 'THE ' prefixes, and punctuation differences between Queue-Times and RCDB names — applies correctly to both land rides and top-level rides - Commit park-meta.json so it ships in the Docker image (fresh volumes get it automatically) - Update .gitignore / .dockerignore to exclude only *.db files, not all of data/ - Dockerfile copies park-meta.json into image before VOLUME declaration - README: document coaster filter setup and correct staleness window (72h not 7d) Co-Authored-By: Claude Sonnet 4.6 --- .dockerignore | 7 +- .gitignore | 4 +- Dockerfile | 7 +- README.md | 11 +- app/park/[id]/page.tsx | 6 +- data/park-meta.json | 416 +++++++++++++++++++++++++++++++++++++ lib/park-meta.ts | 86 ++++++++ lib/scrapers/queuetimes.ts | 28 ++- lib/scrapers/rcdb.ts | 91 ++++++++ scripts/discover.ts | 31 ++- scripts/scrape.ts | 38 ++++ 11 files changed, 710 insertions(+), 15 deletions(-) create mode 100644 data/park-meta.json create mode 100644 lib/park-meta.ts create mode 100644 lib/scrapers/rcdb.ts diff --git a/.dockerignore b/.dockerignore index 2b842da..e825df4 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,10 +2,9 @@ .gitea .next node_modules -data/ -*.db -*.db-shm -*.db-wal +data/*.db +data/*.db-shm +data/*.db-wal .env* npm-debug.log* .DS_Store 
diff --git a/.gitignore b/.gitignore index 356ffd9..da23c4e 100644 --- a/.gitignore +++ b/.gitignore @@ -27,7 +27,9 @@ yarn-debug.log* yarn-error.log* # scraped data — local only, not committed -/data/ +/data/*.db +/data/*.db-shm +/data/*.db-wal # env files .env* diff --git a/Dockerfile b/Dockerfile index e1b175c..a61405e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,8 +44,13 @@ COPY --from=deps --chown=nextjs:nodejs /app/node_modules ./node_modules RUN npx playwright install --with-deps chromium && \ chown -R nextjs:nodejs /app/.playwright -# SQLite data directory — mount a named volume here for persistence +# Seed data directory with park-meta.json (RCDB coaster lists + rcdb_id mappings). +# Must be copied before VOLUME so Docker initialises a fresh named volume with +# this file already present. Existing volumes retain their own copy. RUN mkdir -p /app/data && chown nextjs:nodejs /app/data +COPY --from=builder --chown=nextjs:nodejs /app/data/park-meta.json ./data/park-meta.json + +# SQLite database lives here — mount a named volume for persistence VOLUME ["/app/data"] USER nextjs diff --git a/README.md b/README.md index f08769c..da9e9d1 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,13 @@ The park detail page shows ride open/closed status using a two-tier approach: 2. **Schedule fallback (Six Flags API)** — the Six Flags operating-hours API drops the current day from its response once a park opens. When Queue-Times data is unavailable, the app falls back to the nearest upcoming date from the Six Flags schedule API as an approximation. +### Roller Coaster Filter + +When live data is shown, a **Coasters only** toggle appears if roller coaster data has been populated for that park. Coaster lists are sourced from [RCDB](https://rcdb.com) and stored in `data/park-meta.json`. To populate them: + +1. Open `data/park-meta.json` and set `rcdb_id` for each park to the numeric RCDB park ID (visible in the URL: `https://rcdb.com/4529.htm` → `4529`). +2. 
Run `npm run scrape` — coaster lists are fetched from RCDB and stored in the JSON file. They refresh automatically every 30 days on subsequent scrapes. + --- ## Local Development @@ -111,4 +118,6 @@ docker run --rm -v sixflagssupercalendar_park_data:/app/data \ ## Data Refresh -The scraper skips any park + month combination scraped within the last 7 days. Run `npm run scrape` on a weekly schedule to keep data current. Parks or months not yet in the database show a `—` placeholder; parks with no open days in the displayed week are hidden from the calendar automatically. +The scraper skips any park + month already scraped within the last 72 hours. The nightly Docker scraper service handles this automatically. Parks or months not yet in the database show a `—` placeholder; parks with no open days in the displayed week are hidden from the calendar automatically. + +Roller coaster lists (from RCDB) are re-fetched during `npm run scrape` once they are more than 30 days old, for parks with a configured `rcdb_id`. 
diff --git a/app/park/[id]/page.tsx b/app/park/[id]/page.tsx index 8432647..af611ef 100644 --- a/app/park/[id]/page.tsx +++ b/app/park/[id]/page.tsx @@ -5,6 +5,7 @@ import { openDb, getParkMonthData, getApiId } from "@/lib/db"; import { scrapeRidesForDay } from "@/lib/scrapers/sixflags"; import { fetchLiveRides } from "@/lib/scrapers/queuetimes"; import { QUEUE_TIMES_IDS } from "@/lib/queue-times-map"; +import { readParkMeta, getCoasterSet } from "@/lib/park-meta"; import { ParkMonthCalendar } from "@/components/ParkMonthCalendar"; import { LiveRidePanel } from "@/components/LiveRidePanel"; import type { RideStatus, RidesFetchResult } from "@/lib/scrapers/sixflags"; @@ -46,11 +47,14 @@ export default async function ParkPage({ params, searchParams }: PageProps) { // ── Ride data: try live Queue-Times first, fall back to schedule ────────── const queueTimesId = QUEUE_TIMES_IDS[id]; + const parkMeta = readParkMeta(); + const coasterSet = getCoasterSet(id, parkMeta); + let liveRides: LiveRidesResult | null = null; let ridesResult: RidesFetchResult | null = null; if (queueTimesId) { - liveRides = await fetchLiveRides(queueTimesId); + liveRides = await fetchLiveRides(queueTimesId, coasterSet); } // Only hit the schedule API as a fallback when live data is unavailable diff --git a/data/park-meta.json b/data/park-meta.json new file mode 100644 index 0000000..a67b830 --- /dev/null +++ b/data/park-meta.json @@ -0,0 +1,416 @@ +{ + "greatadventure": { + "rcdb_id": 4534, + "coasters": [ + "Superman - Ultimate Flight", + "El Toro", + "Dark Knight", + "Joker", + "Jersey Devil Coaster", + "Lil' Devil Coaster", + "Flash: Vertical Velocity", + "Batman The Ride", + "Skull Mountain", + "Runaway Mine Train", + "Medusa", + "Harley Quinn Crazy Train", + "Nitro" + ], + "coasters_scraped_at": "2026-04-04T17:40:09.731Z" + }, + "magicmountain": { + "rcdb_id": 4532, + "coasters": [ + "Ninja", + "New Revolution", + "Batman The Ride", + "Viper", + "Gold Rusher", + "Riddler's Revenge", + "Canyon 
Blaster", + "Goliath", + "X2", + "Scream!", + "Tatsu", + "Apocalypse the Ride", + "Road Runner Express", + "Speedy Gonzales Hot Rod Racers", + "Full Throttle", + "Twisted Colossus", + "West Coast Racers", + "Wonder Woman Flight of Courage" + ], + "coasters_scraped_at": "2026-04-04T17:45:43.666Z" + }, + "greatamerica": { + "rcdb_id": 4530, + "coasters": [ + "Demon", + "Batman The Ride", + "American Eagle", + "Viper", + "Whizzer", + "Sprocket Rockets", + "Raging Bull", + "Flash: Vertical Velocity", + "Superman - Ultimate Flight", + "Dark Knight", + "Little Dipper", + "Goliath", + "X-Flight", + "Joker", + "Maxx Force", + "Wrath of Rakshasa" + ], + "coasters_scraped_at": "2026-04-04T17:29:24.092Z" + }, + "overgeorgia": { + "rcdb_id": 4535, + "coasters": [ + "Blue Hawk", + "Great American Scream Machine", + "Dahlonega Mine Train", + "Batman The Ride", + "Georgia Scorcher", + "Superman - Ultimate Flight", + "Joker Funhouse Coaster", + "Goliath", + "Dare Devil Dive", + "Twisted Cyclone", + "Riddler Mindbender", + "Georgia Gold Rusher" + ], + "coasters_scraped_at": "2026-04-04T17:29:26.121Z" + }, + "overtexas": { + "rcdb_id": 4531, + "coasters": [ + "Pandemonium", + "New Texas Giant", + "Joker", + "Aquaman: Power Wave", + "Shock Wave", + "Judge Roy Scream", + "Runaway Mine Train", + "Runaway Mountain", + "Mini Mine Train", + "Mr. Freeze", + "Batman The Ride", + "Titan", + "Wile E. Coyote's Grand Canyon Blaster" + ], + "coasters_scraped_at": "2026-04-04T17:45:45.715Z" + }, + "stlouis": { + "rcdb_id": 4536, + "coasters": [ + "Ninja", + "River King Mine Train", + "Mr. 
Freeze Reverse Blast", + "Batman The Ride", + "Screamin' Eagle", + "Boss", + "Pandemonium", + "American Thunder", + "Boomerang", + "Rookie Racer" + ], + "coasters_scraped_at": "2026-04-04T17:45:47.770Z" + }, + "fiestatexas": { + "rcdb_id": 4538, + "coasters": [ + "Batgirl Coaster Chase", + "Road Runner Express", + "Poltergeist", + "Boomerang Coast to Coaster", + "Superman Krypton Coaster", + "Pandemonium", + "Chupacabra", + "Iron Rattler", + "Batman The Ride", + "Wonder Woman Golden Lasso Coaster", + "Dr. Diabolical's Cliffhanger" + ], + "coasters_scraped_at": "2026-04-04T17:45:49.819Z" + }, + "newengland": { + "rcdb_id": 4565, + "coasters": [ + "Joker", + "Thunderbolt", + "Great Chase", + "Riddler Revenge", + "Superman the Ride", + "Flashback", + "Catwoman's Whip", + "Pandemonium", + "Batman - The Dark Knight", + "Wicked Cyclone", + "Gotham City Gauntlet Escape from Arkham Asylum" + ], + "coasters_scraped_at": "2026-04-04T17:45:51.866Z" + }, + "discoverykingdom": { + "rcdb_id": 4711, + "coasters": [ + "Roadrunner Express", + "Medusa", + "Cobra", + "Flash: Vertical Velocity", + "Kong", + "Boomerang", + "Superman Ultimate Flight", + "Joker", + "Batman The Ride", + "Sidewinder Safari" + ], + "coasters_scraped_at": "2026-04-04T17:45:53.909Z" + }, + "mexico": { + "rcdb_id": 4629, + "coasters": [ + "Tsunami", + "Superman Krypton Coaster", + "Batgirl Batarang", + "Batman The Ride", + "Superman el Último Escape", + "Dark Knight", + "Joker", + "Medusa Steel Coaster", + "Wonder Woman", + "Speedway Stunt Coaster" + ], + "coasters_scraped_at": "2026-04-04T17:45:55.963Z" + }, + "greatescape": { + "rcdb_id": 4596, + "coasters": [ + "Comet", + "Steamin' Demon", + "Flashback", + "Canyon Blaster", + "Frankie's Mine Train", + "Bobcat" + ], + "coasters_scraped_at": "2026-04-04T17:45:58.013Z" + }, + "darienlake": { + "rcdb_id": 4581, + "coasters": [ + "Predator", + "Viper", + "Mind Eraser", + "Boomerang", + "Ride of Steel", + "Hoot N Holler", + "Moto Coaster", + "Tantrum" + ], + 
"coasters_scraped_at": "2026-04-04T17:46:00.042Z" + }, + "cedarpoint": { + "rcdb_id": 4529, + "coasters": [ + "Raptor", + "Rougarou", + "Magnum XL-200", + "Blue Streak", + "Corkscrew", + "Gemini", + "Wilderness Run", + "Woodstock Express", + "Millennium Force", + "Iron Dragon", + "Cedar Creek Mine Ride", + "Maverick", + "GateKeeper", + "Valravn", + "Steel Vengeance", + "Top Thrill 2", + "Wild Mouse", + "Siren’s Curse" + ], + "coasters_scraped_at": "2026-04-04T17:46:02.082Z" + }, + "knotts": { + "rcdb_id": 4546, + "coasters": [ + "Jaguar!", + "GhostRider", + "Xcelerator", + "Silver Bullet", + "Sierra Sidewinder", + "Pony Express", + "Coast Rider", + "HangTime", + "Snoopy’s Tenderpaw Twister Coaster" + ], + "coasters_scraped_at": "2026-04-04T17:46:04.120Z" + }, + "canadaswonderland": { + "rcdb_id": 4539, + "coasters": [ + "Flight Deck", + "Dragon Fyre", + "Mighty Canadian Minebuster", + "Wilde Beast", + "Ghoster Coaster", + "Thunder Run", + "Bat", + "Vortex", + "Taxi Jam", + "Fly", + "Silver Streak", + "Backlot Stunt Coaster", + "Behemoth", + "Leviathan", + "Wonder Mountain's Guardian", + "Yukon Striker", + "Snoopy's Racing Railway", + "AlpenFury" + ], + "coasters_scraped_at": "2026-04-04T17:46:06.152Z" + }, + "carowinds": { + "rcdb_id": 4542, + "coasters": [ + "Carolina Cyclone", + "Woodstock Express", + "Carolina Goldrusher", + "Hurler", + "Vortex", + "Wilderness Run", + "Afterburn", + "Flying Cobras", + "Thunder Striker", + "Fury 325", + "Copperhead Strike", + "Snoopy’s Racing Railway", + "Ricochet", + "Kiddy Hawk" + ], + "coasters_scraped_at": "2026-04-04T17:46:08.185Z" + }, + "kingsdominion": { + "rcdb_id": 4544, + "coasters": [ + "Racer 75", + "Woodstock Express", + "Grizzly", + "Flight of Fear", + "Reptilian", + "Great Pumpkin Coaster", + "Apple Zapple", + "Backlot Stunt Coaster", + "Dominator", + "Pantherian", + "Twisted Timbers", + "Tumbili", + "Rapterra" + ], + "coasters_scraped_at": "2026-04-04T17:46:10.223Z" + }, + "kingsisland": { + "rcdb_id": 4540, + 
"coasters": [ + "Flight of Fear", + "Beast", + "Racer", + "Adventure Express", + "Woodstock Express", + "Bat", + "Great Pumpkin Coaster", + "Invertigo", + "Diamondback", + "Banshee", + "Orion", + "Mystic Timbers", + "Snoopy's Soap Box Racers", + "Woodstock’s Air Rail", + "Queen City Stunt Coaster" + ], + "coasters_scraped_at": "2026-04-04T17:46:12.251Z" + }, + "valleyfair": { + "rcdb_id": 4552, + "coasters": [ + "High Roller", + "Corkscrew", + "Excalibur", + "Wild Thing", + "Mad Mouse", + "Steel Venom", + "Renegade", + "Cosmic Coaster" + ], + "coasters_scraped_at": "2026-04-04T17:46:14.298Z" + }, + "worldsoffun": { + "rcdb_id": 4533, + "coasters": [ + "Timber Wolf", + "Cosmic Coaster", + "Mamba", + "Spinning Dragons", + "Patriot", + "Prowler", + "Zambezi Zinger", + "Boomerang" + ], + "coasters_scraped_at": "2026-04-04T17:46:16.328Z" + }, + "miadventure": { + "rcdb_id": 4578, + "coasters": [ + "Corkscrew", + "Wolverine Wildcat", + "Zach's Zoomer", + "Shivering Timbers", + "Mad Mouse", + "Thunderhawk", + "Woodstock Express" + ], + "coasters_scraped_at": "2026-04-04T17:46:18.370Z" + }, + "dorneypark": { + "rcdb_id": 4588, + "coasters": [ + "Thunderhawk", + "Steel Force", + "Wild Mouse", + "Woodstock Express", + "Talon", + "Hydra the Revenge", + "Possessed", + "Iron Menace" + ], + "coasters_scraped_at": "2026-04-04T17:46:20.413Z" + }, + "cagreatamerica": { + "rcdb_id": 4541, + "coasters": [ + "Demon", + "Grizzly", + "Woodstock Express", + "Patriot", + "Flight Deck", + "Lucy's Crabbie Cabbies", + "Psycho Mouse", + "Gold Striker", + "RailBlazer" + ], + "coasters_scraped_at": "2026-04-04T17:46:22.465Z" + }, + "frontiercity": { + "rcdb_id": 4559, + "coasters": [ + "Silver Bullet", + "Wildcat", + "Diamondback", + "Steel Lasso", + "Frankie's Mine Train" + ], + "coasters_scraped_at": "2026-04-04T17:46:24.519Z" + } +} diff --git a/lib/park-meta.ts b/lib/park-meta.ts new file mode 100644 index 0000000..0b71d7e --- /dev/null +++ b/lib/park-meta.ts @@ -0,0 +1,86 @@ +/** + * 
park-meta.json — persisted alongside the SQLite DB in data/ + * + * This file stores per-park metadata that doesn't belong in the schedule DB: + * - rcdb_id: user-supplied RCDB park ID (fills into https://rcdb.com/{id}.htm) + * - coasters: list of operating roller coaster names scraped from RCDB + * - coasters_scraped_at: ISO timestamp of last RCDB scrape + * + * discover.ts: ensures every park has a skeleton entry (rcdb_id null by default) + * scrape.ts: populates coasters[] for parks with a known rcdb_id (30-day staleness) + */ + +import fs from "fs"; +import path from "path"; + +const META_PATH = path.join(process.cwd(), "data", "park-meta.json"); + +export interface ParkMeta { + /** RCDB park page ID — user fills this in manually after discover creates the skeleton */ + rcdb_id: number | null; + /** Operating roller coaster names scraped from RCDB */ + coasters: string[]; + /** ISO timestamp of when coasters was last scraped from RCDB */ + coasters_scraped_at: string | null; +} + +export type ParkMetaMap = Record; + +export function readParkMeta(): ParkMetaMap { + try { + return JSON.parse(fs.readFileSync(META_PATH, "utf8")) as ParkMetaMap; + } catch { + return {}; + } +} + +export function writeParkMeta(meta: ParkMetaMap): void { + fs.mkdirSync(path.dirname(META_PATH), { recursive: true }); + fs.writeFileSync(META_PATH, JSON.stringify(meta, null, 2) + "\n"); +} + +/** Default skeleton entry for a park that has never been configured. */ +export function defaultParkMeta(): ParkMeta { + return { rcdb_id: null, coasters: [], coasters_scraped_at: null }; +} + +const COASTER_STALE_MS = 30 * 24 * 60 * 60 * 1000; // 30 days + +/** Returns true when the coaster list needs to be re-scraped from RCDB. 
*/ +export function areCoastersStale(entry: ParkMeta): boolean { + if (!entry.coasters_scraped_at) return true; + return Date.now() - new Date(entry.coasters_scraped_at).getTime() > COASTER_STALE_MS; +} + +/** + * Normalize a ride name for fuzzy matching between data sources. + * + * Queue-Times uses branded names (BATMAN™ The Ride, THE JOKER™ Funhouse Coaster) + * while RCDB uses clean names (Batman The Ride, Joker Funhouse Coaster). + * + * Normalization steps: + * 1. Strip trademark/copyright symbols (™ ® ©) + * 2. Strip leading "THE " / "THE" prefix + * 3. Replace punctuation (- : ' ") with spaces + * 4. Collapse runs of whitespace + * 5. Lowercase and trim + */ +export function normalizeRideName(name: string): string { + return name + .replace(/[™®©]/g, "") + .replace(/^the\s+/i, "") + .replace(/[-:'".]/g, " ") + .replace(/\s+/g, " ") + .toLowerCase() + .trim(); +} + +/** + * Returns a Set of normalized coaster names for fast membership checks. + * Returns null when no coaster data exists for the park. + */ +export function getCoasterSet(parkId: string, meta: ParkMetaMap): Set | null { + const entry = meta[parkId]; + if (!entry || entry.coasters.length === 0) return null; + return new Set(entry.coasters.map(normalizeRideName)); +} diff --git a/lib/scrapers/queuetimes.ts b/lib/scrapers/queuetimes.ts index d33053c..71e4fcc 100644 --- a/lib/scrapers/queuetimes.ts +++ b/lib/scrapers/queuetimes.ts @@ -9,6 +9,20 @@ const BASE = "https://queue-times.com/parks"; +/** + * Normalize a ride name for fuzzy matching between Queue-Times and RCDB. + * Strips trademark symbols, leading "THE ", and punctuation before comparing. 
+ */ +function normalize(name: string): string { + return name + .replace(/[™®©]/g, "") + .replace(/^the\s+/i, "") + .replace(/[-:'".]/g, " ") + .replace(/\s+/g, " ") + .toLowerCase() + .trim(); +} + const HEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + @@ -21,7 +35,7 @@ export interface LiveRide { isOpen: boolean; waitMinutes: number; lastUpdated: string; // ISO 8601 - /** True when Queue-Times placed this ride in a "Coasters" land category. */ + /** True when the ride name appears in the RCDB coaster list for this park. */ isCoaster: boolean; } @@ -58,11 +72,16 @@ interface QTResponse { * - The request fails * - The response contains no rides * + * Pass coasterNames (from RCDB static data) to classify rides accurately. + * Matching is case-insensitive. When coasterNames is null no ride is + * classified as a coaster and the "Coasters only" toggle is hidden. + * * Pass revalidate (seconds) to control Next.js ISR cache lifetime. * Defaults to 300s (5 min) to match Queue-Times update frequency. */ export async function fetchLiveRides( queueTimesId: number, + coasterNames: Set | null = null, revalidate = 300, ): Promise { const url = `${BASE}/${queueTimesId}/queue_times.json`; @@ -78,10 +97,7 @@ export async function fetchLiveRides( const rides: LiveRide[] = []; - // Rides are nested inside lands. Queue-Times labels coaster sections - // with names like "Coasters", "Steel Coasters", "Wooden Coasters", etc. for (const land of json.lands ?? []) { - const isCoaster = land.name.toLowerCase().includes("coaster"); for (const r of land.rides ?? []) { if (!r.name) continue; rides.push({ @@ -89,7 +105,7 @@ export async function fetchLiveRides( isOpen: r.is_open, waitMinutes: r.wait_time ?? 0, lastUpdated: r.last_updated, - isCoaster, + isCoaster: coasterNames ? coasterNames.has(normalize(r.name)) : false, }); } } @@ -102,7 +118,7 @@ export async function fetchLiveRides( isOpen: r.is_open, waitMinutes: r.wait_time ?? 
0, lastUpdated: r.last_updated, - isCoaster: false, + isCoaster: coasterNames ? coasterNames.has(normalize(r.name)) : false, }); } diff --git a/lib/scrapers/rcdb.ts b/lib/scrapers/rcdb.ts new file mode 100644 index 0000000..295fd8c --- /dev/null +++ b/lib/scrapers/rcdb.ts @@ -0,0 +1,91 @@ +/** + * RCDB (Roller Coaster DataBase) scraper. + * + * Fetches a park's RCDB page (https://rcdb.com/{id}.htm) and extracts the + * names of operating roller coasters from the "Operating Roller Coasters" + * section. + * + * RCDB has no public API. This scraper reads the static HTML page. + * Please scrape infrequently (30-day staleness window) to be respectful. + */ + +const BASE = "https://rcdb.com"; + +const HEADERS = { + "User-Agent": + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + + "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + Accept: "text/html,application/xhtml+xml", + "Accept-Language": "en-US,en;q=0.9", +}; + +/** + * Scrape operating roller coaster names for a park. + * + * Returns an array of coaster names on success, or null when the page + * cannot be fetched or contains no operating coasters. + */ +export async function scrapeRcdbCoasters(rcdbId: number): Promise { + const url = `${BASE}/${rcdbId}.htm`; + try { + const res = await fetch(url, { headers: HEADERS }); + if (!res.ok) { + console.error(` RCDB ${rcdbId}: HTTP ${res.status}`); + return null; + } + const html = await res.text(); + return parseOperatingCoasters(html); + } catch (err) { + console.error(` RCDB ${rcdbId}: ${err}`); + return null; + } +} + +/** + * Parse operating roller coaster names from RCDB park page HTML. + * + * RCDB park pages list coasters in sections bounded by
tags. + * The operating section heading looks like: + *

Operating Roller Coasters: 16

+ * + * Each coaster is an <a> link to its detail page with an unquoted href: + * <a href=/1234.htm>Batman The Ride</a> + * + * We extract only those links (href=/DIGITS.htm) from within the + * operating section, stopping at the next
tag. + */ +function parseOperatingCoasters(html: string): string[] { + // Find the "Operating Roller Coasters" section heading. + const opIdx = html.search(/Operating\s+Roller\s+Coasters/i); + if (opIdx === -1) return []; + + // The section ends at the next
tag (e.g. "Defunct Roller Coasters"). + const after = html.slice(opIdx); + const nextSection = after.search(/ 0 ? after.slice(0, nextSection) : after; + + // Extract coaster names from links to RCDB detail pages. + // RCDB uses unquoted href attributes: href=/1234.htm + // General links (/g.htm, /r.htm, /location.htm, etc.) won't match \d+\.htm. + const names: string[] = []; + const linkPattern = /]*href=["']?\/(\d+)\.htm["']?[^>]*>([^<]+)<\/a>/gi; + let match: RegExpExecArray | null; + + while ((match = linkPattern.exec(sectionHtml)) !== null) { + const name = decodeHtmlEntities(match[2].trim()); + if (name) names.push(name); + } + + // Deduplicate while preserving order + return [...new Set(names)]; +} + +function decodeHtmlEntities(text: string): string { + return text + .replace(/&/g, "&") + .replace(/</g, "<") + .replace(/>/g, ">") + .replace(/"/g, '"') + .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(parseInt(code, 10))) + .replace(/&[a-z]+;/gi, ""); +} diff --git a/scripts/discover.ts b/scripts/discover.ts index 9e13dc1..1b613a3 100644 --- a/scripts/discover.ts +++ b/scripts/discover.ts @@ -17,6 +17,7 @@ import { chromium } from "playwright"; import { openDb, getApiId, setApiId, type DbInstance } from "../lib/db"; import { PARKS } from "../lib/parks"; import { fetchParkInfo, isMainThemePark } from "../lib/scrapers/sixflags"; +import { readParkMeta, writeParkMeta, defaultParkMeta } from "../lib/park-meta"; const CLOUDFRONT_PATTERN = /operating-hours\/park\/(\d+)/; @@ -124,11 +125,39 @@ async function main() { await new Promise((r) => setTimeout(r, 2000)); } + // ── Ensure park-meta.json has a skeleton entry for every park ──────────── + // Users fill in rcdb_id manually; scrape.ts populates coasters[] from RCDB. 
+ const meta = readParkMeta(); + let metaChanged = false; + + for (const park of PARKS) { + if (!meta[park.id]) { + meta[park.id] = defaultParkMeta(); + metaChanged = true; + } + } + // Remove entries for parks no longer in the registry + for (const id of Object.keys(meta)) { + if (!PARKS.find((p) => p.id === id)) { + delete meta[id]; + metaChanged = true; + } + } + + if (metaChanged) { + writeParkMeta(meta); + console.log("\nUpdated data/park-meta.json"); + console.log(" → Set rcdb_id for each park to enable the coaster filter."); + console.log(" Find a park's RCDB ID from: https://rcdb.com (the number in the URL)."); + } + // Print summary console.log("\n── Discovered IDs ──"); for (const park of PARKS) { const id = getApiId(db, park.id); - console.log(` ${park.id.padEnd(30)} ${id ?? "NOT FOUND"}`); + const rcdbId = meta[park.id]?.rcdb_id; + const rcdbStr = rcdbId ? `rcdb:${rcdbId}` : "rcdb:?"; + console.log(` ${park.id.padEnd(30)} api:${String(id ?? "?").padEnd(8)} ${rcdbStr}`); } db.close(); diff --git a/scripts/scrape.ts b/scripts/scrape.ts index 604cee9..b3977d7 100644 --- a/scripts/scrape.ts +++ b/scripts/scrape.ts @@ -10,6 +10,8 @@ import { openDb, upsertDay, getApiId, isMonthScraped } from "../lib/db"; import { PARKS } from "../lib/parks"; import { scrapeMonth, RateLimitError } from "../lib/scrapers/sixflags"; +import { readParkMeta, writeParkMeta, areCoastersStale } from "../lib/park-meta"; +import { scrapeRcdbCoasters } from "../lib/scrapers/rcdb"; const YEAR = 2026; const MONTHS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; @@ -99,6 +101,42 @@ async function main() { if (totalErrors > 0) console.log(" Re-run to retry failed months."); db.close(); + + // ── RCDB coaster scrape (30-day staleness) ──────────────────────────────── + const meta = readParkMeta(); + const rcdbParks = PARKS.filter((p) => { + const entry = meta[p.id]; + return entry?.rcdb_id && (FORCE || areCoastersStale(entry)); + }); + + if (rcdbParks.length === 0) { + console.log("\nCoaster data 
up to date."); + return; + } + + console.log(`\n── RCDB coaster scrape — ${rcdbParks.length} park(s) ──`); + + for (const park of rcdbParks) { + const entry = meta[park.id]; + const rcdbId = entry.rcdb_id!; + process.stdout.write(` ${park.shortName.padEnd(30)} `); + + const coasters = await scrapeRcdbCoasters(rcdbId); + if (coasters === null) { + console.log("FAILED"); + continue; + } + + entry.coasters = coasters; + entry.coasters_scraped_at = new Date().toISOString(); + console.log(`${coasters.length} coasters`); + + // Polite delay between RCDB requests + await new Promise((r) => setTimeout(r, 2000)); + } + + writeParkMeta(meta); + console.log(" Saved to data/park-meta.json"); } main().catch((err) => {