feat: RCDB-backed roller coaster filter with fuzzy name matching
All checks were successful
Build and Deploy / Build & Push (push) Successful in 2m54s
All checks were successful
Build and Deploy / Build & Push (push) Successful in 2m54s
- Add lib/park-meta.ts to manage data/park-meta.json (rcdb_id + coaster lists)
- Add lib/scrapers/rcdb.ts to scrape operating coaster names from RCDB park pages
- discover.ts now seeds park-meta.json with skeleton entries for all parks
- scrape.ts now refreshes RCDB coaster lists (30-day staleness) for parks with rcdb_id set
- fetchLiveRides() accepts a coasterNames Set; isCoaster uses normalize() on both sides to handle trademark symbols, 'THE ' prefixes, and punctuation differences between Queue-Times and RCDB names — this applies to both land rides and top-level rides
- Commit park-meta.json so it ships in the Docker image (fresh volumes get it automatically)
- Update .gitignore / .dockerignore to exclude only *.db files, not all of data/
- Dockerfile copies park-meta.json into the image before the VOLUME declaration
- README: document coaster filter setup and correct the staleness window (72h, not 7d)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,20 @@
|
||||
|
||||
const BASE = "https://queue-times.com/parks";
|
||||
|
||||
/**
 * Normalize a ride name for fuzzy matching between Queue-Times and RCDB.
 *
 * Steps, in order:
 *  1. Drop trademark / registered / copyright symbols.
 *  2. Drop a leading "THE " (any case), tolerating leading whitespace so
 *     that " The Beast" and "The Beast" normalize identically.
 *  3. Turn separator punctuation into spaces. Typographic quotes and dashes
 *     are treated exactly like their ASCII counterparts so "Joker’s Jinx"
 *     and "Joker's Jinx" compare equal.
 *  4. Collapse whitespace runs, lowercase, and trim.
 *
 * @param name Raw ride name as published by either source.
 * @returns The comparison key; may be the empty string.
 */
function normalize(name: string): string {
  return name
    .replace(/[™®©]/g, "")
    // \s* lets the article be stripped even when the raw name is padded.
    .replace(/^\s*the\s+/i, "")
    // ASCII - : ' " . plus ‘ ’ “ ” (U+2018/19/1C/1D) and – — (U+2013/14).
    .replace(/[-:'".\u2018\u2019\u201C\u201D\u2013\u2014]/g, " ")
    .replace(/\s+/g, " ")
    .toLowerCase()
    .trim();
}
|
||||
|
||||
const HEADERS = {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
|
||||
@@ -21,7 +35,7 @@ export interface LiveRide {
|
||||
isOpen: boolean;
|
||||
waitMinutes: number;
|
||||
lastUpdated: string; // ISO 8601
|
||||
/** True when Queue-Times placed this ride in a "Coasters" land category. */
|
||||
/** True when the ride name appears in the RCDB coaster list for this park. */
|
||||
isCoaster: boolean;
|
||||
}
|
||||
|
||||
@@ -58,11 +72,16 @@ interface QTResponse {
|
||||
* - The request fails
|
||||
* - The response contains no rides
|
||||
*
|
||||
* Pass coasterNames (from RCDB static data) to classify rides accurately.
|
||||
* Matching is case-insensitive. When coasterNames is null no ride is
|
||||
* classified as a coaster and the "Coasters only" toggle is hidden.
|
||||
*
|
||||
* Pass revalidate (seconds) to control Next.js ISR cache lifetime.
|
||||
* Defaults to 300s (5 min) to match Queue-Times update frequency.
|
||||
*/
|
||||
export async function fetchLiveRides(
|
||||
queueTimesId: number,
|
||||
coasterNames: Set<string> | null = null,
|
||||
revalidate = 300,
|
||||
): Promise<LiveRidesResult | null> {
|
||||
const url = `${BASE}/${queueTimesId}/queue_times.json`;
|
||||
@@ -78,10 +97,7 @@ export async function fetchLiveRides(
|
||||
|
||||
const rides: LiveRide[] = [];
|
||||
|
||||
// Rides are nested inside lands. Queue-Times labels coaster sections
|
||||
// with names like "Coasters", "Steel Coasters", "Wooden Coasters", etc.
|
||||
for (const land of json.lands ?? []) {
|
||||
const isCoaster = land.name.toLowerCase().includes("coaster");
|
||||
for (const r of land.rides ?? []) {
|
||||
if (!r.name) continue;
|
||||
rides.push({
|
||||
@@ -89,7 +105,7 @@ export async function fetchLiveRides(
|
||||
isOpen: r.is_open,
|
||||
waitMinutes: r.wait_time ?? 0,
|
||||
lastUpdated: r.last_updated,
|
||||
isCoaster,
|
||||
isCoaster: coasterNames ? coasterNames.has(normalize(r.name)) : false,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -102,7 +118,7 @@ export async function fetchLiveRides(
|
||||
isOpen: r.is_open,
|
||||
waitMinutes: r.wait_time ?? 0,
|
||||
lastUpdated: r.last_updated,
|
||||
isCoaster: false,
|
||||
isCoaster: coasterNames ? coasterNames.has(normalize(r.name)) : false,
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
91
lib/scrapers/rcdb.ts
Normal file
91
lib/scrapers/rcdb.ts
Normal file
@@ -0,0 +1,91 @@
|
||||
/**
|
||||
* RCDB (Roller Coaster DataBase) scraper.
|
||||
*
|
||||
* Fetches a park's RCDB page (https://rcdb.com/{id}.htm) and extracts the
|
||||
* names of operating roller coasters from the "Operating Roller Coasters"
|
||||
* section.
|
||||
*
|
||||
* RCDB has no public API. This scraper reads the static HTML page.
|
||||
* Please scrape infrequently (30-day staleness window) to be respectful.
|
||||
*/
|
||||
|
||||
const BASE = "https://rcdb.com";
|
||||
|
||||
const HEADERS = {
|
||||
"User-Agent":
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
|
||||
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
Accept: "text/html,application/xhtml+xml",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
};
|
||||
|
||||
/**
 * Scrape operating roller coaster names for a park.
 *
 * Fetches `${BASE}/${rcdbId}.htm` and hands the HTML to
 * parseOperatingCoasters().
 *
 * @param rcdbId Numeric RCDB id of the park page.
 * @returns The coaster names on success, or `null` when the request fails,
 *   the server answers with a non-2xx status, or the page yields no
 *   operating coasters. An empty parse result is reported as `null` (not
 *   `[]`) so the return value matches this contract and a failed or empty
 *   scrape cannot be mistaken for a valid coaster list.
 */
export async function scrapeRcdbCoasters(rcdbId: number): Promise<string[] | null> {
  const url = `${BASE}/${rcdbId}.htm`;
  try {
    const res = await fetch(url, { headers: HEADERS });
    if (!res.ok) {
      console.error(` RCDB ${rcdbId}: HTTP ${res.status}`);
      return null;
    }
    const html = await res.text();
    const names = parseOperatingCoasters(html);
    // Honour the documented contract: "no operating coasters" is null.
    return names.length > 0 ? names : null;
  } catch (err) {
    // Network errors and body-read failures both land here; best-effort scrape.
    console.error(` RCDB ${rcdbId}: ${err}`);
    return null;
  }
}
|
||||
|
||||
/**
 * Pull the operating coaster names out of an RCDB park page.
 *
 * The slice of HTML that is scanned starts at the first occurrence of the
 * text "Operating Roller Coasters" (case-insensitive, flexible whitespace)
 * and runs up to the next `<section` tag, or to the end of the document
 * when no such tag follows.
 *
 * Inside that slice only anchors whose href is `/DIGITS.htm` (quoted or
 * unquoted) and whose content is plain text are considered, e.g.
 *   <a href=/5.htm>Batman The Ride</a>
 * Links such as /g.htm or /location.htm do not match the numeric pattern.
 *
 * @param html Full HTML of the park page.
 * @returns Entity-decoded, trimmed names, de-duplicated in first-seen
 *   order; an empty array when the heading text is absent.
 */
function parseOperatingCoasters(html: string): string[] {
  const headingAt = html.search(/Operating\s+Roller\s+Coasters/i);
  if (headingAt === -1) {
    return [];
  }

  // Cut the document down to the operating section only.
  const tail = html.slice(headingAt);
  const sectionEnd = tail.search(/<section\b/i);
  let operatingHtml = tail;
  if (sectionEnd > 0) {
    operatingHtml = tail.slice(0, sectionEnd);
  }

  // A Set keeps insertion order, giving de-duplication for free.
  const unique = new Set<string>();
  const detailLink = /<a\s[^>]*href=["']?\/(\d+)\.htm["']?[^>]*>([^<]+)<\/a>/gi;
  for (const hit of operatingHtml.matchAll(detailLink)) {
    const label = decodeHtmlEntities(hit[2].trim());
    if (label) {
      unique.add(label);
    }
  }

  return Array.from(unique);
}
|
||||
|
||||
/** Named entities that are decoded; any other named entity is dropped. */
const HTML_NAMED_ENTITIES = new Map<string, string>([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", " "],
]);

/**
 * Decode HTML character references in a piece of link text.
 *
 * Handles the named entities listed in HTML_NAMED_ENTITIES, decimal
 * references (`&#39;`) and hexadecimal references (`&#x27;`). Unknown named
 * entities and numeric references outside the Unicode range are removed.
 *
 * Decoding is done in a single pass so that already-escaped text is not
 * decoded twice: `&amp;lt;` becomes the literal text `&lt;`, not `<`.
 *
 * @param text Raw text taken from between HTML tags.
 * @returns The text with character references resolved.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(
    /&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi,
    (_whole: string, body: string): string => {
      if (body.startsWith("#")) {
        const isHex = body.charAt(1).toLowerCase() === "x";
        const code = isHex
          ? parseInt(body.slice(2), 16)
          : parseInt(body.slice(1), 10);
        // fromCodePoint throws above U+10FFFF; drop such references instead.
        return code <= 0x10ffff ? String.fromCodePoint(code) : "";
      }
      return HTML_NAMED_ENTITIES.get(body.toLowerCase()) ?? "";
    },
  );
}
|
||||
Reference in New Issue
Block a user