feat: RCDB-backed roller coaster filter with fuzzy name matching
All checks were successful
Build and Deploy / Build & Push (push) Successful in 2m54s

- Add lib/park-meta.ts to manage data/park-meta.json (rcdb_id + coaster lists)
- Add lib/scrapers/rcdb.ts to scrape operating coaster names from RCDB park pages
- discover.ts now seeds park-meta.json with skeleton entries for all parks
- scrape.ts now refreshes RCDB coaster lists (30-day staleness) for parks with rcdb_id set
- fetchLiveRides() accepts a coasterNames Set; isCoaster uses normalize() on both sides
  to handle trademark symbols, 'THE ' prefixes, and punctuation differences between
  Queue-Times and RCDB names — applies correctly to both land rides and top-level rides
- Commit park-meta.json so it ships in the Docker image (fresh volumes get it automatically)
- Update .gitignore / .dockerignore to exclude only *.db files, not all of data/
- Dockerfile copies park-meta.json into image before VOLUME declaration
- README: document coaster filter setup and correct staleness window (72h not 7d)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-04 13:49:49 -04:00
parent 819e716197
commit 9700d0bd9a
11 changed files with 710 additions and 15 deletions

View File

@@ -2,10 +2,9 @@
.gitea .gitea
.next .next
node_modules node_modules
data/ data/*.db
*.db data/*.db-shm
*.db-shm data/*.db-wal
*.db-wal
.env* .env*
npm-debug.log* npm-debug.log*
.DS_Store .DS_Store

4
.gitignore vendored
View File

@@ -27,7 +27,9 @@ yarn-debug.log*
yarn-error.log* yarn-error.log*
# scraped data — local only, not committed # scraped data — local only, not committed
/data/ /data/*.db
/data/*.db-shm
/data/*.db-wal
# env files # env files
.env* .env*

View File

@@ -44,8 +44,13 @@ COPY --from=deps --chown=nextjs:nodejs /app/node_modules ./node_modules
RUN npx playwright install --with-deps chromium && \ RUN npx playwright install --with-deps chromium && \
chown -R nextjs:nodejs /app/.playwright chown -R nextjs:nodejs /app/.playwright
# SQLite data directory — mount a named volume here for persistence # Seed data directory with park-meta.json (RCDB coaster lists + rcdb_id mappings).
# Must be copied before VOLUME so Docker initialises a fresh named volume with
# this file already present. Existing volumes retain their own copy.
RUN mkdir -p /app/data && chown nextjs:nodejs /app/data RUN mkdir -p /app/data && chown nextjs:nodejs /app/data
COPY --from=builder --chown=nextjs:nodejs /app/data/park-meta.json ./data/park-meta.json
# SQLite database lives here — mount a named volume for persistence
VOLUME ["/app/data"] VOLUME ["/app/data"]
USER nextjs USER nextjs

View File

@@ -31,6 +31,13 @@ The park detail page shows ride open/closed status using a two-tier approach:
2. **Schedule fallback (Six Flags API)** — the Six Flags operating-hours API drops the current day from its response once a park opens. When Queue-Times data is unavailable, the app falls back to the nearest upcoming date from the Six Flags schedule API as an approximation. 2. **Schedule fallback (Six Flags API)** — the Six Flags operating-hours API drops the current day from its response once a park opens. When Queue-Times data is unavailable, the app falls back to the nearest upcoming date from the Six Flags schedule API as an approximation.
### Roller Coaster Filter
When live data is shown, a **Coasters only** toggle appears if roller coaster data has been populated for that park. Coaster lists are sourced from [RCDB](https://rcdb.com) and stored in `data/park-meta.json`. To populate them:
1. Open `data/park-meta.json` and set `rcdb_id` for each park to the numeric RCDB park ID (visible in the URL: `https://rcdb.com/4529.htm` → `4529`).
2. Run `npm run scrape` — coaster lists are fetched from RCDB and stored in the JSON file. They refresh automatically every 30 days on subsequent scrapes.
--- ---
## Local Development ## Local Development
@@ -111,4 +118,6 @@ docker run --rm -v sixflagssupercalendar_park_data:/app/data \
## Data Refresh ## Data Refresh
The scraper skips any park + month combination scraped within the last 7 days. Run `npm run scrape` on a weekly schedule to keep data current. Parks or months not yet in the database show a `—` placeholder; parks with no open days in the displayed week are hidden from the calendar automatically. The scraper skips any park + month already scraped within the last 72 hours. The nightly Docker scraper service handles this automatically. Parks or months not yet in the database show a `—` placeholder; parks with no open days in the displayed week are hidden from the calendar automatically.
Roller coaster lists (from RCDB) are re-fetched during `npm run scrape` once they are more than 30 days old, for parks with a configured `rcdb_id`.

View File

@@ -5,6 +5,7 @@ import { openDb, getParkMonthData, getApiId } from "@/lib/db";
import { scrapeRidesForDay } from "@/lib/scrapers/sixflags"; import { scrapeRidesForDay } from "@/lib/scrapers/sixflags";
import { fetchLiveRides } from "@/lib/scrapers/queuetimes"; import { fetchLiveRides } from "@/lib/scrapers/queuetimes";
import { QUEUE_TIMES_IDS } from "@/lib/queue-times-map"; import { QUEUE_TIMES_IDS } from "@/lib/queue-times-map";
import { readParkMeta, getCoasterSet } from "@/lib/park-meta";
import { ParkMonthCalendar } from "@/components/ParkMonthCalendar"; import { ParkMonthCalendar } from "@/components/ParkMonthCalendar";
import { LiveRidePanel } from "@/components/LiveRidePanel"; import { LiveRidePanel } from "@/components/LiveRidePanel";
import type { RideStatus, RidesFetchResult } from "@/lib/scrapers/sixflags"; import type { RideStatus, RidesFetchResult } from "@/lib/scrapers/sixflags";
@@ -46,11 +47,14 @@ export default async function ParkPage({ params, searchParams }: PageProps) {
// ── Ride data: try live Queue-Times first, fall back to schedule ────────── // ── Ride data: try live Queue-Times first, fall back to schedule ──────────
const queueTimesId = QUEUE_TIMES_IDS[id]; const queueTimesId = QUEUE_TIMES_IDS[id];
const parkMeta = readParkMeta();
const coasterSet = getCoasterSet(id, parkMeta);
let liveRides: LiveRidesResult | null = null; let liveRides: LiveRidesResult | null = null;
let ridesResult: RidesFetchResult | null = null; let ridesResult: RidesFetchResult | null = null;
if (queueTimesId) { if (queueTimesId) {
liveRides = await fetchLiveRides(queueTimesId); liveRides = await fetchLiveRides(queueTimesId, coasterSet);
} }
// Only hit the schedule API as a fallback when live data is unavailable // Only hit the schedule API as a fallback when live data is unavailable

416
data/park-meta.json Normal file
View File

@@ -0,0 +1,416 @@
{
"greatadventure": {
"rcdb_id": 4534,
"coasters": [
"Superman - Ultimate Flight",
"El Toro",
"Dark Knight",
"Joker",
"Jersey Devil Coaster",
"Lil' Devil Coaster",
"Flash: Vertical Velocity",
"Batman The Ride",
"Skull Mountain",
"Runaway Mine Train",
"Medusa",
"Harley Quinn Crazy Train",
"Nitro"
],
"coasters_scraped_at": "2026-04-04T17:40:09.731Z"
},
"magicmountain": {
"rcdb_id": 4532,
"coasters": [
"Ninja",
"New Revolution",
"Batman The Ride",
"Viper",
"Gold Rusher",
"Riddler's Revenge",
"Canyon Blaster",
"Goliath",
"X2",
"Scream!",
"Tatsu",
"Apocalypse the Ride",
"Road Runner Express",
"Speedy Gonzales Hot Rod Racers",
"Full Throttle",
"Twisted Colossus",
"West Coast Racers",
"Wonder Woman Flight of Courage"
],
"coasters_scraped_at": "2026-04-04T17:45:43.666Z"
},
"greatamerica": {
"rcdb_id": 4530,
"coasters": [
"Demon",
"Batman The Ride",
"American Eagle",
"Viper",
"Whizzer",
"Sprocket Rockets",
"Raging Bull",
"Flash: Vertical Velocity",
"Superman - Ultimate Flight",
"Dark Knight",
"Little Dipper",
"Goliath",
"X-Flight",
"Joker",
"Maxx Force",
"Wrath of Rakshasa"
],
"coasters_scraped_at": "2026-04-04T17:29:24.092Z"
},
"overgeorgia": {
"rcdb_id": 4535,
"coasters": [
"Blue Hawk",
"Great American Scream Machine",
"Dahlonega Mine Train",
"Batman The Ride",
"Georgia Scorcher",
"Superman - Ultimate Flight",
"Joker Funhouse Coaster",
"Goliath",
"Dare Devil Dive",
"Twisted Cyclone",
"Riddler Mindbender",
"Georgia Gold Rusher"
],
"coasters_scraped_at": "2026-04-04T17:29:26.121Z"
},
"overtexas": {
"rcdb_id": 4531,
"coasters": [
"Pandemonium",
"New Texas Giant",
"Joker",
"Aquaman: Power Wave",
"Shock Wave",
"Judge Roy Scream",
"Runaway Mine Train",
"Runaway Mountain",
"Mini Mine Train",
"Mr. Freeze",
"Batman The Ride",
"Titan",
"Wile E. Coyote's Grand Canyon Blaster"
],
"coasters_scraped_at": "2026-04-04T17:45:45.715Z"
},
"stlouis": {
"rcdb_id": 4536,
"coasters": [
"Ninja",
"River King Mine Train",
"Mr. Freeze Reverse Blast",
"Batman The Ride",
"Screamin' Eagle",
"Boss",
"Pandemonium",
"American Thunder",
"Boomerang",
"Rookie Racer"
],
"coasters_scraped_at": "2026-04-04T17:45:47.770Z"
},
"fiestatexas": {
"rcdb_id": 4538,
"coasters": [
"Batgirl Coaster Chase",
"Road Runner Express",
"Poltergeist",
"Boomerang Coast to Coaster",
"Superman Krypton Coaster",
"Pandemonium",
"Chupacabra",
"Iron Rattler",
"Batman The Ride",
"Wonder Woman Golden Lasso Coaster",
"Dr. Diabolical's Cliffhanger"
],
"coasters_scraped_at": "2026-04-04T17:45:49.819Z"
},
"newengland": {
"rcdb_id": 4565,
"coasters": [
"Joker",
"Thunderbolt",
"Great Chase",
"Riddler Revenge",
"Superman the Ride",
"Flashback",
"Catwoman's Whip",
"Pandemonium",
"Batman - The Dark Knight",
"Wicked Cyclone",
"Gotham City Gauntlet Escape from Arkham Asylum"
],
"coasters_scraped_at": "2026-04-04T17:45:51.866Z"
},
"discoverykingdom": {
"rcdb_id": 4711,
"coasters": [
"Roadrunner Express",
"Medusa",
"Cobra",
"Flash: Vertical Velocity",
"Kong",
"Boomerang",
"Superman Ultimate Flight",
"Joker",
"Batman The Ride",
"Sidewinder Safari"
],
"coasters_scraped_at": "2026-04-04T17:45:53.909Z"
},
"mexico": {
"rcdb_id": 4629,
"coasters": [
"Tsunami",
"Superman Krypton Coaster",
"Batgirl Batarang",
"Batman The Ride",
"Superman el Último Escape",
"Dark Knight",
"Joker",
"Medusa Steel Coaster",
"Wonder Woman",
"Speedway Stunt Coaster"
],
"coasters_scraped_at": "2026-04-04T17:45:55.963Z"
},
"greatescape": {
"rcdb_id": 4596,
"coasters": [
"Comet",
"Steamin' Demon",
"Flashback",
"Canyon Blaster",
"Frankie's Mine Train",
"Bobcat"
],
"coasters_scraped_at": "2026-04-04T17:45:58.013Z"
},
"darienlake": {
"rcdb_id": 4581,
"coasters": [
"Predator",
"Viper",
"Mind Eraser",
"Boomerang",
"Ride of Steel",
"Hoot N Holler",
"Moto Coaster",
"Tantrum"
],
"coasters_scraped_at": "2026-04-04T17:46:00.042Z"
},
"cedarpoint": {
"rcdb_id": 4529,
"coasters": [
"Raptor",
"Rougarou",
"Magnum XL-200",
"Blue Streak",
"Corkscrew",
"Gemini",
"Wilderness Run",
"Woodstock Express",
"Millennium Force",
"Iron Dragon",
"Cedar Creek Mine Ride",
"Maverick",
"GateKeeper",
"Valravn",
"Steel Vengeance",
"Top Thrill 2",
"Wild Mouse",
"Sirens Curse"
],
"coasters_scraped_at": "2026-04-04T17:46:02.082Z"
},
"knotts": {
"rcdb_id": 4546,
"coasters": [
"Jaguar!",
"GhostRider",
"Xcelerator",
"Silver Bullet",
"Sierra Sidewinder",
"Pony Express",
"Coast Rider",
"HangTime",
"Snoopys Tenderpaw Twister Coaster"
],
"coasters_scraped_at": "2026-04-04T17:46:04.120Z"
},
"canadaswonderland": {
"rcdb_id": 4539,
"coasters": [
"Flight Deck",
"Dragon Fyre",
"Mighty Canadian Minebuster",
"Wilde Beast",
"Ghoster Coaster",
"Thunder Run",
"Bat",
"Vortex",
"Taxi Jam",
"Fly",
"Silver Streak",
"Backlot Stunt Coaster",
"Behemoth",
"Leviathan",
"Wonder Mountain's Guardian",
"Yukon Striker",
"Snoopy's Racing Railway",
"AlpenFury"
],
"coasters_scraped_at": "2026-04-04T17:46:06.152Z"
},
"carowinds": {
"rcdb_id": 4542,
"coasters": [
"Carolina Cyclone",
"Woodstock Express",
"Carolina Goldrusher",
"Hurler",
"Vortex",
"Wilderness Run",
"Afterburn",
"Flying Cobras",
"Thunder Striker",
"Fury 325",
"Copperhead Strike",
"Snoopys Racing Railway",
"Ricochet",
"Kiddy Hawk"
],
"coasters_scraped_at": "2026-04-04T17:46:08.185Z"
},
"kingsdominion": {
"rcdb_id": 4544,
"coasters": [
"Racer 75",
"Woodstock Express",
"Grizzly",
"Flight of Fear",
"Reptilian",
"Great Pumpkin Coaster",
"Apple Zapple",
"Backlot Stunt Coaster",
"Dominator",
"Pantherian",
"Twisted Timbers",
"Tumbili",
"Rapterra"
],
"coasters_scraped_at": "2026-04-04T17:46:10.223Z"
},
"kingsisland": {
"rcdb_id": 4540,
"coasters": [
"Flight of Fear",
"Beast",
"Racer",
"Adventure Express",
"Woodstock Express",
"Bat",
"Great Pumpkin Coaster",
"Invertigo",
"Diamondback",
"Banshee",
"Orion",
"Mystic Timbers",
"Snoopy's Soap Box Racers",
"Woodstocks Air Rail",
"Queen City Stunt Coaster"
],
"coasters_scraped_at": "2026-04-04T17:46:12.251Z"
},
"valleyfair": {
"rcdb_id": 4552,
"coasters": [
"High Roller",
"Corkscrew",
"Excalibur",
"Wild Thing",
"Mad Mouse",
"Steel Venom",
"Renegade",
"Cosmic Coaster"
],
"coasters_scraped_at": "2026-04-04T17:46:14.298Z"
},
"worldsoffun": {
"rcdb_id": 4533,
"coasters": [
"Timber Wolf",
"Cosmic Coaster",
"Mamba",
"Spinning Dragons",
"Patriot",
"Prowler",
"Zambezi Zinger",
"Boomerang"
],
"coasters_scraped_at": "2026-04-04T17:46:16.328Z"
},
"miadventure": {
"rcdb_id": 4578,
"coasters": [
"Corkscrew",
"Wolverine Wildcat",
"Zach's Zoomer",
"Shivering Timbers",
"Mad Mouse",
"Thunderhawk",
"Woodstock Express"
],
"coasters_scraped_at": "2026-04-04T17:46:18.370Z"
},
"dorneypark": {
"rcdb_id": 4588,
"coasters": [
"Thunderhawk",
"Steel Force",
"Wild Mouse",
"Woodstock Express",
"Talon",
"Hydra the Revenge",
"Possessed",
"Iron Menace"
],
"coasters_scraped_at": "2026-04-04T17:46:20.413Z"
},
"cagreatamerica": {
"rcdb_id": 4541,
"coasters": [
"Demon",
"Grizzly",
"Woodstock Express",
"Patriot",
"Flight Deck",
"Lucy's Crabbie Cabbies",
"Psycho Mouse",
"Gold Striker",
"RailBlazer"
],
"coasters_scraped_at": "2026-04-04T17:46:22.465Z"
},
"frontiercity": {
"rcdb_id": 4559,
"coasters": [
"Silver Bullet",
"Wildcat",
"Diamondback",
"Steel Lasso",
"Frankie's Mine Train"
],
"coasters_scraped_at": "2026-04-04T17:46:24.519Z"
}
}

86
lib/park-meta.ts Normal file
View File

@@ -0,0 +1,86 @@
/**
* park-meta.json — persisted alongside the SQLite DB in data/
*
* This file stores per-park metadata that doesn't belong in the schedule DB:
* - rcdb_id: user-supplied RCDB park ID (fills into https://rcdb.com/{id}.htm)
* - coasters: list of operating roller coaster names scraped from RCDB
* - coasters_scraped_at: ISO timestamp of last RCDB scrape
*
* discover.ts: ensures every park has a skeleton entry (rcdb_id null by default)
* scrape.ts: populates coasters[] for parks with a known rcdb_id (30-day staleness)
*/
import fs from "fs";
import path from "path";
/** Location of the metadata file: `data/park-meta.json` under the process working directory. */
const META_PATH = path.join(process.cwd(), "data", "park-meta.json");
/** Per-park metadata record as stored in park-meta.json. */
export interface ParkMeta {
/**
 * RCDB park page ID (the number in https://rcdb.com/{id}.htm) — the user fills
 * this in manually after discover creates the skeleton. null until configured.
 */
rcdb_id: number | null;
/** Operating roller coaster names scraped from RCDB; empty until the first scrape */
coasters: string[];
/**
 * ISO timestamp of when coasters was last scraped from RCDB.
 * null means never scraped, which areCoastersStale() reports as stale.
 */
coasters_scraped_at: string | null;
}
/** The whole park-meta.json document: one ParkMeta entry per park id. */
export type ParkMetaMap = Record<string, ParkMeta>;
/**
 * Load park-meta.json from disk.
 *
 * This is best-effort and never throws: every failure mode yields an empty
 * map so callers can treat "no metadata" and "unreadable metadata" alike.
 *
 * - Missing file: returns {} silently (normal before the file is first written).
 * - Unreadable file, invalid JSON, or a JSON document that is not a plain
 *   object (null, array, string, …): returns {} and logs a warning, so a
 *   corrupt file is visible in the logs instead of looking like "no data".
 *
 * @returns The parsed map keyed by park id, or {} when nothing usable exists.
 */
export function readParkMeta(): ParkMetaMap {
  let raw: string;
  try {
    raw = fs.readFileSync(META_PATH, "utf8");
  } catch (err: unknown) {
    const code = err instanceof Error ? (err as Error & { code?: unknown }).code : undefined;
    // ENOENT is the expected "not created yet" case; anything else is worth surfacing.
    if (code !== "ENOENT") {
      console.warn(`park-meta: could not read ${META_PATH}: ${String(err)}`);
    }
    return {};
  }

  try {
    const parsed: unknown = JSON.parse(raw);
    // JSON.parse happily returns null / arrays / primitives; indexing those by
    // park id would throw or misbehave, so accept plain objects only.
    if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
      return parsed as ParkMetaMap;
    }
    console.warn(`park-meta: ${META_PATH} does not contain a JSON object; ignoring it`);
  } catch (err: unknown) {
    console.warn(`park-meta: ${META_PATH} is not valid JSON: ${String(err)}`);
  }
  return {};
}
/**
 * Persist the metadata map to park-meta.json (pretty-printed, trailing newline).
 *
 * The payload is written to a temporary sibling file and then renamed over the
 * real path, so a concurrent reader sees either the old file or the new one,
 * never a half-written document.
 *
 * @param meta Full map to write; it replaces the file's previous contents.
 * @throws Re-throws any filesystem error after removing the temporary file.
 */
export function writeParkMeta(meta: ParkMetaMap): void {
  fs.mkdirSync(path.dirname(META_PATH), { recursive: true });

  const payload = JSON.stringify(meta, null, 2) + "\n";
  // Include the pid so two writers never share a temp file.
  const tmpPath = `${META_PATH}.${process.pid}.tmp`;

  try {
    fs.writeFileSync(tmpPath, payload);
    fs.renameSync(tmpPath, META_PATH);
  } catch (err: unknown) {
    // Don't leave a stray temp file behind; force:true ignores "already gone".
    fs.rmSync(tmpPath, { force: true });
    throw err;
  }
}
/**
 * Build the placeholder record for a park that has not been configured yet:
 * no RCDB id, no coasters, never scraped. A new object (and a new array) is
 * created on every call, so callers may mutate the result freely.
 */
export function defaultParkMeta(): ParkMeta {
  const skeleton: ParkMeta = {
    rcdb_id: null,
    coasters: [],
    coasters_scraped_at: null,
  };
  return skeleton;
}
/** Maximum age of a scraped coaster list before it is re-fetched: 30 days, in ms. */
const COASTER_STALE_MS = 30 * 24 * 60 * 60 * 1000;
/**
 * Returns true when the coaster list needs to be re-scraped from RCDB.
 *
 * An entry is stale when it has never been scraped, when its timestamp cannot
 * be parsed, or when the timestamp is more than 30 days old.
 *
 * @param entry Park metadata record to inspect.
 */
export function areCoastersStale(entry: ParkMeta): boolean {
  if (!entry.coasters_scraped_at) return true;

  const scrapedAt = Date.parse(entry.coasters_scraped_at);
  // A malformed timestamp parses to NaN, and every comparison with NaN is
  // false — without this guard such an entry would never be refreshed.
  if (Number.isNaN(scrapedAt)) return true;

  return Date.now() - scrapedAt > COASTER_STALE_MS;
}
/**
 * Normalize a ride name for fuzzy matching between data sources.
 *
 * Queue-Times uses branded names (BATMAN™ The Ride, THE JOKER™ Funhouse Coaster)
 * while RCDB uses clean names (Batman The Ride, Joker Funhouse Coaster).
 *
 * Normalization steps:
 * 1. Strip trademark/copyright symbols (™ ® ©)
 * 2. Strip a leading "THE " prefix (any case), ignoring leading whitespace
 * 3. Replace punctuation (- : ' " .) with spaces
 * 4. Collapse runs of whitespace
 * 5. Lowercase and trim
 *
 * @param name Raw ride name from either source.
 * @returns The normalized key; two names match when their keys are equal.
 */
export function normalizeRideName(name: string): string {
  return name
    .replace(/[™®©]/g, "")
    // \s* lets the prefix strip work when the raw name has leading whitespace;
    // trimming only happens at the very end of the chain.
    .replace(/^\s*the\s+/i, "")
    .replace(/[-:'".]/g, " ")
    .replace(/\s+/g, " ")
    .toLowerCase()
    .trim();
}
/**
 * Returns a Set of normalized coaster names for fast membership checks.
 *
 * Returns null when no usable coaster data exists for the park: the park has
 * no entry, the entry has no `coasters` array (park-meta.json is edited by
 * hand, so fields can be missing), or the array holds no string names.
 *
 * @param parkId Park id used as the key into the metadata map.
 * @param meta   Metadata map, typically the result of readParkMeta().
 */
export function getCoasterSet(parkId: string, meta: ParkMetaMap): Set<string> | null {
  // The file is external, hand-editable input: treat the value as unknown
  // rather than trusting the declared type, so a malformed entry yields
  // "no data" instead of a TypeError.
  const coasters: unknown = meta[parkId]?.coasters;
  if (!Array.isArray(coasters)) return null;

  const names = coasters.filter((c): c is string => typeof c === "string");
  if (names.length === 0) return null;

  return new Set(names.map(normalizeRideName));
}

View File

@@ -9,6 +9,20 @@
const BASE = "https://queue-times.com/parks"; const BASE = "https://queue-times.com/parks";
/**
 * Normalize a ride name for fuzzy matching between Queue-Times and RCDB.
 *
 * Strips trademark symbols (™ ® ©) and a leading "THE " prefix (ignoring any
 * leading whitespace), replaces the punctuation - : ' " . with spaces,
 * collapses whitespace runs, then lowercases and trims the result.
 */
function normalize(name: string): string {
  return name
    .replace(/[™®©]/g, "")
    // \s* lets the prefix strip work when the raw name has leading whitespace;
    // trimming only happens at the very end of the chain.
    .replace(/^\s*the\s+/i, "")
    .replace(/[-:'".]/g, " ")
    .replace(/\s+/g, " ")
    .toLowerCase()
    .trim();
}
const HEADERS = { const HEADERS = {
"User-Agent": "User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
@@ -21,7 +35,7 @@ export interface LiveRide {
isOpen: boolean; isOpen: boolean;
waitMinutes: number; waitMinutes: number;
lastUpdated: string; // ISO 8601 lastUpdated: string; // ISO 8601
/** True when Queue-Times placed this ride in a "Coasters" land category. */ /** True when the ride name appears in the RCDB coaster list for this park. */
isCoaster: boolean; isCoaster: boolean;
} }
@@ -58,11 +72,16 @@ interface QTResponse {
* - The request fails * - The request fails
* - The response contains no rides * - The response contains no rides
* *
* Pass coasterNames (from RCDB static data) to classify rides accurately.
* Matching is case-insensitive. When coasterNames is null no ride is
* classified as a coaster and the "Coasters only" toggle is hidden.
*
* Pass revalidate (seconds) to control Next.js ISR cache lifetime. * Pass revalidate (seconds) to control Next.js ISR cache lifetime.
* Defaults to 300s (5 min) to match Queue-Times update frequency. * Defaults to 300s (5 min) to match Queue-Times update frequency.
*/ */
export async function fetchLiveRides( export async function fetchLiveRides(
queueTimesId: number, queueTimesId: number,
coasterNames: Set<string> | null = null,
revalidate = 300, revalidate = 300,
): Promise<LiveRidesResult | null> { ): Promise<LiveRidesResult | null> {
const url = `${BASE}/${queueTimesId}/queue_times.json`; const url = `${BASE}/${queueTimesId}/queue_times.json`;
@@ -78,10 +97,7 @@ export async function fetchLiveRides(
const rides: LiveRide[] = []; const rides: LiveRide[] = [];
// Rides are nested inside lands. Queue-Times labels coaster sections
// with names like "Coasters", "Steel Coasters", "Wooden Coasters", etc.
for (const land of json.lands ?? []) { for (const land of json.lands ?? []) {
const isCoaster = land.name.toLowerCase().includes("coaster");
for (const r of land.rides ?? []) { for (const r of land.rides ?? []) {
if (!r.name) continue; if (!r.name) continue;
rides.push({ rides.push({
@@ -89,7 +105,7 @@ export async function fetchLiveRides(
isOpen: r.is_open, isOpen: r.is_open,
waitMinutes: r.wait_time ?? 0, waitMinutes: r.wait_time ?? 0,
lastUpdated: r.last_updated, lastUpdated: r.last_updated,
isCoaster, isCoaster: coasterNames ? coasterNames.has(normalize(r.name)) : false,
}); });
} }
} }
@@ -102,7 +118,7 @@ export async function fetchLiveRides(
isOpen: r.is_open, isOpen: r.is_open,
waitMinutes: r.wait_time ?? 0, waitMinutes: r.wait_time ?? 0,
lastUpdated: r.last_updated, lastUpdated: r.last_updated,
isCoaster: false, isCoaster: coasterNames ? coasterNames.has(normalize(r.name)) : false,
}); });
} }

91
lib/scrapers/rcdb.ts Normal file
View File

@@ -0,0 +1,91 @@
/**
* RCDB (Roller Coaster DataBase) scraper.
*
* Fetches a park's RCDB page (https://rcdb.com/{id}.htm) and extracts the
* names of operating roller coasters from the "Operating Roller Coasters"
* section.
*
* RCDB has no public API. This scraper reads the static HTML page.
* Please scrape infrequently (30-day staleness window) to be respectful.
*/
/** RCDB site root; park pages are addressed as `${BASE}/{id}.htm`. */
const BASE = "https://rcdb.com";
/**
 * Request headers sent with every RCDB page fetch: a desktop-browser
 * User-Agent string plus HTML / English content preferences.
 */
const HEADERS = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
Accept: "text/html,application/xhtml+xml",
"Accept-Language": "en-US,en;q=0.9",
};
/**
 * Scrape operating roller coaster names for a park.
 *
 * Fetches the park page at `${BASE}/{rcdbId}.htm` and extracts the names
 * listed in its "Operating Roller Coasters" section.
 *
 * @param rcdbId Numeric RCDB park ID (the number in the park page URL).
 * @returns An array of coaster names on success, or null when the page
 *          cannot be fetched (network error or non-OK status) or yields no
 *          operating coasters. Every failure is logged with console.error;
 *          this function never throws.
 */
export async function scrapeRcdbCoasters(rcdbId: number): Promise<string[] | null> {
  const url = `${BASE}/${rcdbId}.htm`;
  try {
    const res = await fetch(url, { headers: HEADERS });
    if (!res.ok) {
      console.error(` RCDB ${rcdbId}: HTTP ${res.status}`);
      return null;
    }

    const coasters = parseOperatingCoasters(await res.text());
    // An empty result means the page had no recognisable operating section
    // (layout change, block page served with HTTP 200, …). Report it as a
    // failure, as documented, so callers do not replace a good list with [].
    if (coasters.length === 0) {
      console.error(` RCDB ${rcdbId}: no operating coasters found`);
      return null;
    }
    return coasters;
  } catch (err) {
    console.error(` RCDB ${rcdbId}: ${err}`);
    return null;
  }
}
/**
 * Parse operating roller coaster names from RCDB park page HTML.
 *
 * RCDB park pages list coasters in sections bounded by <section> tags.
 * The operating section heading looks like:
 * <h4>Operating Roller Coasters: <a href="...">16</a></h4>
 *
 * Each coaster is an <a> link to its detail page with an unquoted href:
 * <td data-sort="Batman The Ride"><a href=/5.htm>Batman The Ride</a>
 *
 * We extract only those links (href=/DIGITS.htm) from within the
 * operating section, stopping at the next <section> tag.
 *
 * @param html Full HTML of an RCDB park page.
 * @returns Coaster names in page order, entity-decoded, trimmed and
 *          de-duplicated. Empty when the operating section is absent.
 */
function parseOperatingCoasters(html: string): string[] {
  // Find the "Operating Roller Coasters" section heading.
  const opIdx = html.search(/Operating\s+Roller\s+Coasters/i);
  if (opIdx === -1) return [];

  // The section ends at the next <section> tag (e.g. "Defunct Roller Coasters").
  const after = html.slice(opIdx);
  const nextSection = after.search(/<section\b/i);
  const sectionHtml = nextSection > 0 ? after.slice(0, nextSection) : after;

  // RCDB uses unquoted href attributes: href=/1234.htm
  // General links (/g.htm, /r.htm, /location.htm, etc.) won't match \d+\.htm.
  const linkPattern = /<a\s[^>]*href=["']?\/(\d+)\.htm["']?[^>]*>([^<]+)<\/a>/gi;

  // A Set de-duplicates while preserving first-seen order.
  const names = new Set<string>();
  for (const match of sectionHtml.matchAll(linkPattern)) {
    // Decode first, trim second, so entity-encoded whitespace is trimmed too.
    const name = decodeHtmlEntities(match[2]).trim();
    if (name) names.add(name);
  }
  return [...names];
}

/** Named entities decoded by decodeHtmlEntities; a Map avoids prototype-key lookups. */
const NAMED_HTML_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", "\u00A0"],
  ["lsquo", "\u2018"],
  ["rsquo", "\u2019"],
  ["ldquo", "\u201C"],
  ["rdquo", "\u201D"],
  ["ndash", "\u2013"],
  ["mdash", "\u2014"],
]);

/**
 * Decode HTML character references in a text fragment.
 *
 * Handles decimal (&#233;) and hexadecimal (&#xE9;) numeric references and
 * the named entities in NAMED_HTML_ENTITIES. Decoding is a single pass, so
 * already-decoded output is never re-interpreted ("&amp;lt;" becomes "&lt;",
 * not "<"). Unrecognised alphabetic entities are removed; numeric references
 * outside the Unicode range are left untouched.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(
    /&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi,
    (raw: string, dec: string | undefined, hex: string | undefined, named: string | undefined) => {
      if (named !== undefined) {
        // Entity names are case-sensitive; fall back to lowercase for &AMP; etc.
        return (
          NAMED_HTML_ENTITIES.get(named) ?? NAMED_HTML_ENTITIES.get(named.toLowerCase()) ?? ""
        );
      }
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
      // String.fromCodePoint throws a RangeError above U+10FFFF, so guard first.
      if (!Number.isInteger(codePoint) || codePoint <= 0 || codePoint > 0x10ffff) return raw;
      return String.fromCodePoint(codePoint);
    },
  );
}

View File

@@ -17,6 +17,7 @@ import { chromium } from "playwright";
import { openDb, getApiId, setApiId, type DbInstance } from "../lib/db"; import { openDb, getApiId, setApiId, type DbInstance } from "../lib/db";
import { PARKS } from "../lib/parks"; import { PARKS } from "../lib/parks";
import { fetchParkInfo, isMainThemePark } from "../lib/scrapers/sixflags"; import { fetchParkInfo, isMainThemePark } from "../lib/scrapers/sixflags";
import { readParkMeta, writeParkMeta, defaultParkMeta } from "../lib/park-meta";
const CLOUDFRONT_PATTERN = /operating-hours\/park\/(\d+)/; const CLOUDFRONT_PATTERN = /operating-hours\/park\/(\d+)/;
@@ -124,11 +125,39 @@ async function main() {
await new Promise((r) => setTimeout(r, 2000)); await new Promise((r) => setTimeout(r, 2000));
} }
// ── Ensure park-meta.json has a skeleton entry for every park ────────────
// Users fill in rcdb_id manually; scrape.ts populates coasters[] from RCDB.
const meta = readParkMeta();
let metaChanged = false;
for (const park of PARKS) {
if (!meta[park.id]) {
meta[park.id] = defaultParkMeta();
metaChanged = true;
}
}
// Remove entries for parks no longer in the registry
for (const id of Object.keys(meta)) {
if (!PARKS.find((p) => p.id === id)) {
delete meta[id];
metaChanged = true;
}
}
if (metaChanged) {
writeParkMeta(meta);
console.log("\nUpdated data/park-meta.json");
console.log(" → Set rcdb_id for each park to enable the coaster filter.");
console.log(" Find a park's RCDB ID from: https://rcdb.com (the number in the URL).");
}
// Print summary // Print summary
console.log("\n── Discovered IDs ──"); console.log("\n── Discovered IDs ──");
for (const park of PARKS) { for (const park of PARKS) {
const id = getApiId(db, park.id); const id = getApiId(db, park.id);
console.log(` ${park.id.padEnd(30)} ${id ?? "NOT FOUND"}`); const rcdbId = meta[park.id]?.rcdb_id;
const rcdbStr = rcdbId ? `rcdb:${rcdbId}` : "rcdb:?";
console.log(` ${park.id.padEnd(30)} api:${String(id ?? "?").padEnd(8)} ${rcdbStr}`);
} }
db.close(); db.close();

View File

@@ -10,6 +10,8 @@
import { openDb, upsertDay, getApiId, isMonthScraped } from "../lib/db"; import { openDb, upsertDay, getApiId, isMonthScraped } from "../lib/db";
import { PARKS } from "../lib/parks"; import { PARKS } from "../lib/parks";
import { scrapeMonth, RateLimitError } from "../lib/scrapers/sixflags"; import { scrapeMonth, RateLimitError } from "../lib/scrapers/sixflags";
import { readParkMeta, writeParkMeta, areCoastersStale } from "../lib/park-meta";
import { scrapeRcdbCoasters } from "../lib/scrapers/rcdb";
const YEAR = 2026; const YEAR = 2026;
const MONTHS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]; const MONTHS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
@@ -99,6 +101,42 @@ async function main() {
if (totalErrors > 0) console.log(" Re-run to retry failed months."); if (totalErrors > 0) console.log(" Re-run to retry failed months.");
db.close(); db.close();
// ── RCDB coaster scrape (30-day staleness) ────────────────────────────────
const meta = readParkMeta();
const rcdbParks = PARKS.filter((p) => {
const entry = meta[p.id];
return entry?.rcdb_id && (FORCE || areCoastersStale(entry));
});
if (rcdbParks.length === 0) {
console.log("\nCoaster data up to date.");
return;
}
console.log(`\n── RCDB coaster scrape — ${rcdbParks.length} park(s) ──`);
for (const park of rcdbParks) {
const entry = meta[park.id];
const rcdbId = entry.rcdb_id!;
process.stdout.write(` ${park.shortName.padEnd(30)} `);
const coasters = await scrapeRcdbCoasters(rcdbId);
if (coasters === null) {
console.log("FAILED");
continue;
}
entry.coasters = coasters;
entry.coasters_scraped_at = new Date().toISOString();
console.log(`${coasters.length} coasters`);
// Polite delay between RCDB requests
await new Promise((r) => setTimeout(r, 2000));
}
writeParkMeta(meta);
console.log(" Saved to data/park-meta.json");
} }
main().catch((err) => { main().catch((err) => {