feat: RCDB-backed roller coaster filter with fuzzy name matching
All checks were successful
Build and Deploy / Build & Push (push) Successful in 2m54s

- Add lib/park-meta.ts to manage data/park-meta.json (rcdb_id + coaster lists)
- Add lib/scrapers/rcdb.ts to scrape operating coaster names from RCDB park pages
- discover.ts now seeds park-meta.json with skeleton entries for all parks
- scrape.ts now refreshes RCDB coaster lists (30-day staleness) for parks with rcdb_id set
- fetchLiveRides() accepts a coasterNames Set; isCoaster uses normalize() on both sides
  to handle trademark symbols, 'THE ' prefixes, and punctuation differences between
  Queue-Times and RCDB names — applies correctly to both land rides and top-level rides
- Commit park-meta.json so it ships in the Docker image (fresh volumes get it automatically)
- Update .gitignore / .dockerignore to exclude only *.db files, not all of data/
- Dockerfile copies park-meta.json into image before VOLUME declaration
- README: document coaster filter setup and correct staleness window (72h not 7d)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-04 13:49:49 -04:00
parent 819e716197
commit 9700d0bd9a
11 changed files with 710 additions and 15 deletions

86
lib/park-meta.ts Normal file
View File

@@ -0,0 +1,86 @@
/**
* park-meta.json — persisted alongside the SQLite DB in data/
*
* This file stores per-park metadata that doesn't belong in the schedule DB:
* - rcdb_id: user-supplied RCDB park ID (fills into https://rcdb.com/{id}.htm)
* - coasters: list of operating roller coaster names scraped from RCDB
* - coasters_scraped_at: ISO timestamp of last RCDB scrape
*
* discover.ts: ensures every park has a skeleton entry (rcdb_id null by default)
* scrape.ts: populates coasters[] for parks with a known rcdb_id (30-day staleness)
*/
import fs from "fs";
import path from "path";
/** Location of the metadata file: `data/park-meta.json` under the process's current working directory. */
const META_PATH = path.join(process.cwd(), "data", "park-meta.json");
/** Metadata record kept for a single park; one value of the park-meta.json map. */
export interface ParkMeta {
/** RCDB park page ID — user fills this in manually after discover creates the skeleton; null until then. */
rcdb_id: number | null;
/** Operating roller coaster names scraped from RCDB; empty until a scrape has stored names. */
coasters: string[];
/** ISO timestamp of when coasters was last scraped from RCDB; null when never scraped. */
coasters_scraped_at: string | null;
}
/** Shape of the whole park-meta.json document: park ID (as a string key) mapped to that park's metadata. */
export type ParkMetaMap = Record<string, ParkMeta>;
/**
 * Load the park metadata map from disk.
 *
 * Always returns an object that is safe to index: an empty map is returned
 * when the file is missing, unreadable, not valid JSON, or when its top-level
 * value is not a JSON object (e.g. `null` or an array).
 *
 * A missing file is the expected first-run state and stays silent. Every
 * other failure is logged, because falling back to `{}` without a trace
 * hides a corrupted file from the operator.
 *
 * @returns The parsed metadata map, or `{}` when nothing usable could be read.
 */
export function readParkMeta(): ParkMetaMap {
  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(META_PATH, "utf8"));
  } catch (err: unknown) {
    const code =
      typeof err === "object" && err !== null && "code" in err
        ? (err as { code?: unknown }).code
        : undefined;
    if (code !== "ENOENT") {
      const reason = err instanceof Error ? err.message : String(err);
      console.error(`park-meta: could not read ${META_PATH}: ${reason}`);
    }
    return {};
  }

  // JSON.parse happily returns null, arrays, or scalars; none of those can be
  // used as a park-id → metadata map.
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    console.error(`park-meta: ${META_PATH} does not contain a JSON object; ignoring it`);
    return {};
  }
  return parsed as ParkMetaMap;
}
/**
 * Persist the complete park metadata map to disk.
 *
 * Ensures the containing directory exists, then overwrites the file with
 * 2-space-indented JSON terminated by a single newline.
 *
 * @param meta The full map to store; it replaces whatever the file held.
 */
export function writeParkMeta(meta: ParkMetaMap): void {
  const targetDir = path.dirname(META_PATH);
  fs.mkdirSync(targetDir, { recursive: true });

  const serialized = JSON.stringify(meta, null, 2) + "\n";
  fs.writeFileSync(META_PATH, serialized);
}
/**
 * Build the placeholder record for a park that has not been configured yet:
 * no RCDB ID, no coaster names, and no scrape timestamp.
 *
 * @returns A fresh object on every call, so callers may mutate it freely.
 */
export function defaultParkMeta(): ParkMeta {
  const skeleton: ParkMeta = {
    rcdb_id: null,
    coasters: [],
    coasters_scraped_at: null,
  };
  return skeleton;
}
/** Maximum age of a scraped coaster list before it is considered stale: 30 days, in milliseconds. */
const COASTER_STALE_MS = 30 * 24 * 60 * 60 * 1000; // 30 days
/**
 * Returns true when the coaster list needs to be re-scraped from RCDB.
 *
 * A list is stale when it has never been scraped, when its timestamp cannot
 * be parsed as a date, or when the timestamp is more than 30 days old.
 *
 * @param entry The park's metadata record.
 * @returns Whether the coaster list should be refreshed.
 */
export function areCoastersStale(entry: ParkMeta): boolean {
  if (!entry.coasters_scraped_at) return true;

  const scrapedAt = new Date(entry.coasters_scraped_at).getTime();
  // An unparseable timestamp yields NaN, and `NaN > x` is always false — without
  // this guard a corrupted timestamp would mark the list as fresh forever.
  if (Number.isNaN(scrapedAt)) return true;

  return Date.now() - scrapedAt > COASTER_STALE_MS;
}
/**
 * Normalize a ride name for fuzzy matching between data sources.
 *
 * Queue-Times uses branded names (BATMAN™ The Ride, THE JOKER™ Funhouse Coaster)
 * while RCDB uses clean names (Batman The Ride, Joker Funhouse Coaster).
 *
 * Normalization steps:
 *   1. Strip trademark/copyright symbols (™ ® ©)
 *   2. Strip a leading "the" followed by whitespace, in any letter case,
 *      ignoring whitespace in front of it (so " The Joker" and "™ THE JOKER"
 *      are handled too)
 *   3. Replace punctuation (- : ' " .) with spaces
 *   4. Collapse runs of whitespace
 *   5. Lowercase and trim
 *
 * Only a leading "the" is removed; one in the middle of a name is kept
 * ("BATMAN™ The Ride" → "batman the ride").
 *
 * @param name Raw ride name from either data source.
 * @returns The normalized comparison key.
 */
export function normalizeRideName(name: string): string {
  return name
    .replace(/[™®©]/g, "")
    // `\s*` lets the prefix match even when the name starts with whitespace,
    // which also covers the gap left behind by a removed leading symbol.
    .replace(/^\s*the\s+/i, "")
    .replace(/[-:'".]/g, " ")
    .replace(/\s+/g, " ")
    .toLowerCase()
    .trim();
}
/**
 * Returns a Set of normalized coaster names for fast membership checks.
 *
 * Returns null when no coaster data exists for the park: the park has no
 * entry of its own in the map, its `coasters` field is missing or not an
 * array, or the list contains no string names.
 *
 * The metadata file is hand-edited and loaded without validation, so the
 * entry shape is checked here instead of being trusted.
 *
 * @param parkId Key of the park in the metadata map.
 * @param meta The metadata map as loaded from park-meta.json.
 * @returns Normalized coaster names, or null when there are none.
 */
export function getCoasterSet(parkId: string, meta: ParkMetaMap): Set<string> | null {
  // Own-property check: ids such as "toString" must not resolve to members
  // inherited from Object.prototype.
  const entry = Object.prototype.hasOwnProperty.call(meta, parkId) ? meta[parkId] : undefined;

  const rawCoasters: unknown = entry?.coasters;
  if (!Array.isArray(rawCoasters)) return null;

  const names = rawCoasters.filter((item): item is string => typeof item === "string");
  if (names.length === 0) return null;

  return new Set(names.map((coasterName) => normalizeRideName(coasterName)));
}

View File

@@ -9,6 +9,20 @@
const BASE = "https://queue-times.com/parks";
/**
 * Normalize a ride name for fuzzy matching between Queue-Times and RCDB.
 *
 * Steps, in order: remove trademark/copyright symbols (™ ® ©); remove a
 * leading "the" followed by whitespace (any letter case, ignoring whitespace
 * in front of it); turn the punctuation characters - : ' " . into spaces;
 * collapse whitespace runs; lowercase; trim.
 *
 * @param name Raw ride name.
 * @returns The normalized comparison key.
 */
function normalize(name: string): string {
  const withoutSymbols = name.replace(/[™®©]/g, "");
  // `\s*` keeps the prefix strip working when the name starts with whitespace,
  // including the gap left behind by a removed leading symbol.
  const withoutArticle = withoutSymbols.replace(/^\s*the\s+/i, "");
  return withoutArticle
    .replace(/[-:'".]/g, " ")
    .replace(/\s+/g, " ")
    .toLowerCase()
    .trim();
}
const HEADERS = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
@@ -21,7 +35,7 @@ export interface LiveRide {
isOpen: boolean;
waitMinutes: number;
lastUpdated: string; // ISO 8601
/** True when Queue-Times placed this ride in a "Coasters" land category. */
/** True when the ride name appears in the RCDB coaster list for this park. */
isCoaster: boolean;
}
@@ -58,11 +72,16 @@ interface QTResponse {
* - The request fails
* - The response contains no rides
*
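* Pass coasterNames (from RCDB static data) to classify rides accurately.
* The set must already contain normalized names: each Queue-Times ride name
* is passed through normalize() before the lookup, so matching ignores case,
* trademark symbols, a leading "THE", and punctuation. When coasterNames is
* null no ride is classified as a coaster and the "Coasters only" toggle is
* hidden.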
*
* Pass revalidate (seconds) to control Next.js ISR cache lifetime.
* Defaults to 300s (5 min) to match Queue-Times update frequency.
*/
export async function fetchLiveRides(
queueTimesId: number,
coasterNames: Set<string> | null = null,
revalidate = 300,
): Promise<LiveRidesResult | null> {
const url = `${BASE}/${queueTimesId}/queue_times.json`;
@@ -78,10 +97,7 @@ export async function fetchLiveRides(
const rides: LiveRide[] = [];
// Rides are nested inside lands. Queue-Times labels coaster sections
// with names like "Coasters", "Steel Coasters", "Wooden Coasters", etc.
for (const land of json.lands ?? []) {
const isCoaster = land.name.toLowerCase().includes("coaster");
for (const r of land.rides ?? []) {
if (!r.name) continue;
rides.push({
@@ -89,7 +105,7 @@ export async function fetchLiveRides(
isOpen: r.is_open,
waitMinutes: r.wait_time ?? 0,
lastUpdated: r.last_updated,
isCoaster,
isCoaster: coasterNames ? coasterNames.has(normalize(r.name)) : false,
});
}
}
@@ -102,7 +118,7 @@ export async function fetchLiveRides(
isOpen: r.is_open,
waitMinutes: r.wait_time ?? 0,
lastUpdated: r.last_updated,
isCoaster: false,
isCoaster: coasterNames ? coasterNames.has(normalize(r.name)) : false,
});
}

91
lib/scrapers/rcdb.ts Normal file
View File

@@ -0,0 +1,91 @@
/**
* RCDB (Roller Coaster DataBase) scraper.
*
* Fetches a park's RCDB page (https://rcdb.com/{id}.htm) and extracts the
* names of operating roller coasters from the "Operating Roller Coasters"
* section.
*
* RCDB has no public API. This scraper reads the static HTML page.
* Please scrape infrequently (30-day staleness window) to be respectful.
*/
/** Root URL of the RCDB site; a park page is fetched from `${BASE}/{id}.htm`. */
const BASE = "https://rcdb.com";
/**
 * HTTP headers sent with every RCDB page request: a desktop Chrome
 * User-Agent string, an Accept header for HTML/XHTML, and an English
 * Accept-Language preference.
 */
const HEADERS = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
Accept: "text/html,application/xhtml+xml",
"Accept-Language": "en-US,en;q=0.9",
};
/**
 * Scrape operating roller coaster names for a park.
 *
 * Returns an array of coaster names on success, or null when the ID is not a
 * positive integer, the page cannot be fetched, or the page contains no
 * operating coasters. Every null outcome is logged; this function never
 * throws for network or HTTP failures.
 *
 * @param rcdbId RCDB park page ID (the number in https://rcdb.com/{id}.htm).
 * @returns A non-empty list of coaster names, or null.
 */
export async function scrapeRcdbCoasters(rcdbId: number): Promise<string[] | null> {
  // The ID comes from a hand-edited JSON file and is interpolated into the
  // URL, so reject anything that is not a plain positive integer.
  if (!Number.isSafeInteger(rcdbId) || rcdbId <= 0) {
    console.error(` RCDB ${rcdbId}: invalid park ID`);
    return null;
  }

  const url = `${BASE}/${rcdbId}.htm`;
  try {
    const res = await fetch(url, { headers: HEADERS });
    if (!res.ok) {
      console.error(` RCDB ${rcdbId}: HTTP ${res.status}`);
      return null;
    }
    const html = await res.text();
    const names = parseOperatingCoasters(html);
    // Honour the documented contract: "no operating coasters" is reported as
    // null rather than as an empty (but truthy) array.
    if (names.length === 0) {
      console.error(` RCDB ${rcdbId}: no operating coasters found`);
      return null;
    }
    return names;
  } catch (err) {
    console.error(` RCDB ${rcdbId}: ${err}`);
    return null;
  }
}
/**
 * Extract operating roller coaster names from the HTML of an RCDB park page.
 *
 * The page groups coasters into <section> blocks. The operating block is
 * introduced by a heading such as:
 *   <h4>Operating Roller Coasters: <a href="...">16</a></h4>
 *
 * Inside it, each coaster is a link to a numeric detail page, typically with
 * an unquoted href:
 *   <td data-sort="Batman The Ride"><a href=/5.htm>Batman The Ride</a>
 *
 * Only links of the form href=/DIGITS.htm are collected, from the heading up
 * to the next <section> tag (or to the end of the document when there is
 * none). Names are entity-decoded and de-duplicated, keeping first-seen order.
 *
 * @param html Full HTML of the park page.
 * @returns Coaster names; empty when the heading or matching links are absent.
 */
function parseOperatingCoasters(html: string): string[] {
  const headingStart = html.search(/Operating\s+Roller\s+Coasters/i);
  if (headingStart === -1) return [];

  // Cut the document down to the operating section: from the heading up to
  // the next <section> tag (e.g. "Defunct Roller Coasters").
  const fromHeading = html.slice(headingStart);
  const sectionEnd = fromHeading.search(/<section\b/i);
  const operatingHtml = sectionEnd > 0 ? fromHeading.slice(0, sectionEnd) : fromHeading;

  // Detail-page links only: href=/1234.htm, quoted or unquoted.
  // General links (/g.htm, /r.htm, /location.htm, etc.) won't match \d+\.htm.
  const detailLink = /<a\s[^>]*href=["']?\/(\d+)\.htm["']?[^>]*>([^<]+)<\/a>/gi;

  // A Set keeps insertion order, so duplicates collapse onto their first occurrence.
  const unique = new Set<string>();
  for (const found of operatingHtml.matchAll(detailLink)) {
    const decoded = decodeHtmlEntities(found[2].trim());
    if (decoded) unique.add(decoded);
  }
  return [...unique];
}
/** Named entities that are decoded; any other named entity is removed. */
const NAMED_HTML_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  // Decoded to a plain space so adjacent words stay separated.
  ["nbsp", " "],
]);

/**
 * Decode the HTML entities that can appear in a scraped link text.
 *
 * Handles the named entities amp, lt, gt, quot, apos and nbsp, plus decimal
 * (&#39;) and hexadecimal (&#x27;) numeric character references. Named
 * entities outside that list are removed from the text.
 *
 * Decoding is done in a single pass, so text produced by one replacement is
 * never decoded again ("&amp;lt;" becomes "&lt;", not "<").
 *
 * @param text Raw text that may contain HTML entities.
 * @returns The decoded text.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (_whole: string, body: string) => {
    if (!body.startsWith("#")) {
      return NAMED_HTML_ENTITIES.get(body) ?? "";
    }
    const isHex = body[1] === "x" || body[1] === "X";
    const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
    // fromCodePoint throws above U+10FFFF; substitute the replacement character.
    // It is used instead of fromCharCode so references beyond U+FFFF decode correctly.
    return codePoint > 0x10ffff ? "\uFFFD" : String.fromCodePoint(codePoint);
  });
}