/**
 * RCDB (Roller Coaster DataBase) scraper.
 *
 * Fetches a park's RCDB page (https://rcdb.com/{id}.htm) and extracts the
 * names of operating roller coasters from the "Operating Roller Coasters"
 * section.
 *
 * RCDB has no public API. This scraper reads the static HTML page.
 * Please scrape infrequently (30-day staleness window) to be respectful.
 */

const BASE = "https://rcdb.com";

const HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml",
  "Accept-Language": "en-US,en;q=0.9",
};

/** Locates the heading text that opens the operating-coasters section. */
const OPERATING_HEADING = /Operating\s+Roller\s+Coasters/i;

/** Locates the opening tag of the next <section>, which ends the current one. */
const SECTION_START = /<section/i;

/**
 * Matches an anchor whose href is a numeric RCDB detail page, quoted or not.
 * Group 1 is the numeric id, group 2 is the raw link text.
 */
const COASTER_LINK =
  /<a\s+[^>]*?href=["']?\/(\d+)\.htm["']?[^>]*>([^<]+)<\/a>/gi;

/** Named character references this scraper decodes; other names are dropped. */
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/**
 * Matches one character reference: decimal (group 1), hexadecimal (group 2),
 * or a letters-only name (group 3).
 */
const ENTITY = /&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi;

/**
 * Scrape operating roller coaster names for a park.
 *
 * @param rcdbId - Numeric RCDB id of the park; used to build `{BASE}/{id}.htm`.
 * @returns The de-duplicated coaster names found in the operating section
 *   (an empty array when the page has no such section or no coaster links),
 *   or `null` when the request fails or the response status is not OK.
 *   Failures are logged with `console.error` and never thrown.
 */
export async function scrapeRcdbCoasters(
  rcdbId: number,
): Promise<string[] | null> {
  const url = `${BASE}/${rcdbId}.htm`;
  try {
    const res = await fetch(url, { headers: HEADERS });
    if (!res.ok) {
      console.error(` RCDB ${rcdbId}: HTTP ${res.status}`);
      return null;
    }
    const html = await res.text();
    return parseOperatingCoasters(html);
  } catch (err: unknown) {
    // Network errors and body-read errors are treated like a failed fetch.
    console.error(` RCDB ${rcdbId}: ${err}`);
    return null;
  }
}

/**
 * Parse operating roller coaster names from RCDB park page HTML.
 *
 * RCDB park pages list coasters in sections bounded by <section> tags.
 * The operating section has a heading whose text reads like:
 *   Operating Roller Coasters: 16
 *
 * Each coaster is an <a> link to its detail page with an unquoted href:
 *   <a href=/1234.htm>Batman The Ride</a>
 *
 * Only those links (href=/DIGITS.htm) are extracted from within the
 * operating section, stopping at the next <section> tag.
 *
 * @param html - Full HTML of an RCDB park page.
 * @returns Coaster names in page order with duplicates removed; an empty
 *   array when the operating heading is not present.
 */
function parseOperatingCoasters(html: string): string[] {
  const headingIdx = html.search(OPERATING_HEADING);
  if (headingIdx === -1) return [];

  // The section ends at the next <section> tag (e.g. "Defunct Roller
  // Coasters"); when there is none, it runs to the end of the document.
  const fromHeading = html.slice(headingIdx);
  const sectionEnd = fromHeading.search(SECTION_START);
  const sectionHtml =
    sectionEnd === -1 ? fromHeading : fromHeading.slice(0, sectionEnd);

  // General links (/g.htm, /r.htm, /location.htm, etc.) do not match the
  // numeric-id pattern, so only coaster detail links are collected.
  const names = new Set<string>();
  for (const link of sectionHtml.matchAll(COASTER_LINK)) {
    // Decode before trimming so entity-encoded whitespace is trimmed as well.
    const name = decodeHtmlEntities(link[2] ?? "").trim();
    if (name) names.add(name);
  }

  // A Set iterates in insertion order, so page order is preserved.
  return [...names];
}

/**
 * Decode HTML character references in a short text fragment.
 *
 * Decoding happens in a single pass so that already-decoded output is never
 * decoded a second time (`&amp;lt;` becomes `&lt;`, not `<`).
 *
 * - `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&apos;` map to their characters.
 * - Decimal (`&#39;`) and hexadecimal (`&#x27;`) references map to the
 *   corresponding Unicode code point, including code points above U+FFFF.
 *   References outside the Unicode range are left untouched.
 * - Any other letters-only named reference is removed.
 *
 * @param text - Text that may contain character references.
 * @returns The text with recognised references decoded.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(
    ENTITY,
    (entity: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) return NAMED_ENTITIES.get(name) ?? "";
      const codePoint =
        dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
      // String.fromCodePoint throws on values beyond U+10FFFF, so guard first.
      return Number.isInteger(codePoint) && codePoint <= 0x10ffff
        ? String.fromCodePoint(codePoint)
        : entity;
    },
  );
}