feat: RCDB-backed roller coaster filter with fuzzy name matching

- Add lib/park-meta.ts to manage data/park-meta.json (rcdb_id + coaster lists)
- Add lib/scrapers/rcdb.ts to scrape operating coaster names from RCDB park pages
- discover.ts now seeds park-meta.json with skeleton entries for all parks
- scrape.ts now refreshes RCDB coaster lists (30-day staleness) for parks with rcdb_id set
- fetchLiveRides() accepts a coasterNames Set; isCoaster uses normalize() on both sides to handle trademark symbols, 'THE ' prefixes, and punctuation differences between Queue-Times and RCDB names — applies correctly to both land rides and top-level rides
- Commit park-meta.json so it ships in the Docker image (fresh volumes get it automatically)
- Update .gitignore / .dockerignore to exclude only *.db files, not all of data/
- Dockerfile copies park-meta.json into image before VOLUME declaration
- README: document coaster filter setup and correct staleness window (72h not 7d)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-04 13:49:49 -04:00
parent 819e716197
commit 9700d0bd9a
11 changed files with 710 additions and 15 deletions
@@ -0,0 +1,91 @@
+/**
+ * RCDB (Roller Coaster DataBase) scraper.
+ *
+ * Fetches a park's RCDB page (https://rcdb.com/{id}.htm) and extracts the
+ * names of operating roller coasters from the "Operating Roller Coasters"
+ * section.
+ *
+ * RCDB has no public API. This scraper reads the static HTML page.
+ * Please scrape infrequently (30-day staleness window) to be respectful.
+ */
+
+/** Origin of the RCDB site; park page URLs are built as `${BASE}/{id}.htm`. */
+const BASE = "https://rcdb.com";
+
+/** Browser-like request headers passed to `fetch` when requesting RCDB pages. */
+const HEADERS = {
+  // Assembled from two fragments purely to keep the source line short;
+  // the resulting value is a single space-separated User-Agent string.
+  "User-Agent": [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+  ].join(" "),
+  Accept: "text/html,application/xhtml+xml",
+  "Accept-Language": "en-US,en;q=0.9",
+};
+
+/**
+ * Scrape operating roller coaster names for a park.
+ *
+ * @param rcdbId Numeric RCDB id of the park; the page requested is
+ *   `${BASE}/${rcdbId}.htm`.
+ * @returns The coaster names found on the page, or `null` when the page
+ *   cannot be fetched (network error or non-OK HTTP status) or contains no
+ *   operating coasters. Failures are logged with `console.error` and
+ *   reported as `null` rather than thrown.
+ */
+export async function scrapeRcdbCoasters(rcdbId: number): Promise<string[] | null> {
+  const url = `${BASE}/${rcdbId}.htm`;
+  try {
+    const res = await fetch(url, { headers: HEADERS });
+    if (!res.ok) {
+      console.error(`  RCDB ${rcdbId}: HTTP ${res.status}`);
+      return null;
+    }
+    const names = parseOperatingCoasters(await res.text());
+    if (names.length === 0) {
+      // Honour the documented contract: an empty parse is reported as null so
+      // callers can tell "nothing usable on this page" apart from a real list
+      // (an empty array would otherwise look like a successful scrape).
+      console.error(`  RCDB ${rcdbId}: no operating coasters found`);
+      return null;
+    }
+    return names;
+  } catch (err) {
+    console.error(`  RCDB ${rcdbId}: ${err}`);
+    return null;
+  }
+}
+
+/**
+ * Parse operating roller coaster names from RCDB park page HTML.
+ *
+ * RCDB park pages list coasters in sections bounded by <section> tags.
+ * The operating section heading looks like:
+ *   <h4>Operating Roller Coasters: <a href="...">16</a></h4>
+ *
+ * Each coaster is an <a> link to its detail page with an unquoted href:
+ *   <td data-sort="Batman The Ride"><a href=/5.htm>Batman The Ride</a>
+ *
+ * Only links of the form href=/DIGITS.htm (optionally quoted) that appear
+ * between the heading and the next <section> tag are considered.
+ *
+ * @param html Full HTML text of an RCDB park page.
+ * @returns Entity-decoded, trimmed coaster names in page order with
+ *   duplicates removed; an empty array when the heading is not present or
+ *   no matching links are found.
+ */
+function parseOperatingCoasters(html: string): string[] {
+  // Locate the "Operating Roller Coasters" heading text.
+  const opIdx = html.search(/Operating\s+Roller\s+Coasters/i);
+  if (opIdx === -1) return [];
+
+  // The section ends at the next <section> tag (e.g. "Defunct Roller Coasters");
+  // when there is none, scan to the end of the document.
+  const after = html.slice(opIdx);
+  const nextSection = after.search(/<section\b/i);
+  const sectionHtml = nextSection > 0 ? after.slice(0, nextSection) : after;
+
+  // General links (/g.htm, /r.htm, /location.htm, etc.) won't match \d+\.htm.
+  const linkPattern = /<a\s[^>]*href=["']?\/(\d+)\.htm["']?[^>]*>([^<]+)<\/a>/gi;
+
+  // A Set removes duplicates while keeping first-seen order.
+  const names = new Set<string>();
+  for (const match of sectionHtml.matchAll(linkPattern)) {
+    // Trim AFTER decoding so whitespace produced by entities (e.g. "&#32;",
+    // or a space left next to a stripped entity) does not survive at the
+    // edges of the name or defeat de-duplication.
+    const name = decodeHtmlEntities(match[2] ?? "").trim();
+    if (name) names.add(name);
+  }
+
+  return [...names];
+}
+
+/**
+ * Decode HTML character references in a text fragment.
+ *
+ * Handles the named references `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`
+ * and `&nbsp;`, plus decimal (`&#39;`) and hexadecimal (`&#x27;`) numeric
+ * references. Any other named reference is removed from the output.
+ *
+ * Decoding is done in a single pass, so text produced by one replacement is
+ * never decoded a second time (`&amp;lt;` becomes `&lt;`, not `<`).
+ *
+ * @param text Text that may contain HTML character references.
+ * @returns The text with references decoded; numeric references beyond
+ *   U+10FFFF are dropped.
+ */
+function decodeHtmlEntities(text: string): string {
+  // A Map (rather than an object literal) so names such as "constructor"
+  // cannot resolve to inherited Object.prototype members.
+  const named = new Map<string, string>([
+    ["amp", "&"],
+    ["lt", "<"],
+    ["gt", ">"],
+    ["quot", '"'],
+    ["apos", "'"],
+    ["nbsp", "\u00a0"],
+  ]);
+
+  return text.replace(
+    /&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi,
+    (_whole: string, hex?: string, dec?: string, name?: string): string => {
+      if (name !== undefined) return named.get(name) ?? "";
+      const code = hex !== undefined ? parseInt(hex, 16) : parseInt(dec ?? "", 10);
+      // fromCodePoint handles astral characters (emoji etc.) but throws a
+      // RangeError above U+10FFFF, so out-of-range references are dropped.
+      return code <= 0x10ffff ? String.fromCodePoint(code) : "";
+    },
+  );
+}