// SixFlagsSuperCalendar/lib/scrapers/rcdb.ts

/**
 * RCDB (Roller Coaster DataBase) scraper.
 *
 * Fetches a park's RCDB page (https://rcdb.com/{id}.htm) and extracts the
 * names of operating roller coasters from the "Operating Roller Coasters"
 * section.
 *
 * RCDB has no public API. This scraper reads the static HTML page.
 * Please scrape infrequently (30-day staleness window) to be respectful.
 */

/** Site origin; a park page URL is built as `${BASE}/{id}.htm`. */
const BASE = "https://rcdb.com";

/** Browser-like request headers passed to `fetch` when loading an RCDB page. */
const HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml",
  "Accept-Language": "en-US,en;q=0.9",
};

/**
 * Scrape operating roller coaster names for a park.
 *
 * Requests `${BASE}/${rcdbId}.htm` with the module's HEADERS and a 15-second
 * abort timeout, then hands the response body to parseOperatingCoasters.
 * Failures are logged with console.error and never thrown to the caller.
 *
 * @param rcdbId - Numeric id used to build the park page URL.
 * @returns The coaster names on success, or null when the request throws or
 *   times out, the response status is not OK, or the page yields no
 *   operating coasters.
 */
export async function scrapeRcdbCoasters(rcdbId: number): Promise<string[] | null> {
  const url = `${BASE}/${rcdbId}.htm`;
  try {
    const res = await fetch(url, { headers: HEADERS, signal: AbortSignal.timeout(15_000) });
    if (!res.ok) {
      console.error(`  RCDB ${rcdbId}: HTTP ${res.status}`);
      return null;
    }
    const html = await res.text();
    const names = parseOperatingCoasters(html);
    // The parser signals "nothing found" with an empty array. Map that to null
    // so callers that only check for null (as documented) never mistake an
    // empty parse for a successful scrape.
    return names.length > 0 ? names : null;
  } catch (err) {
    // Covers network errors and the AbortSignal timeout.
    console.error(`  RCDB ${rcdbId}: ${err}`);
    return null;
  }
}

/**
 * Pull the operating roller coaster names out of an RCDB park page.
 *
 * Page layout this relies on:
 *   - Coaster lists sit in <section> blocks; the operating one is headed by
 *       <h4>Operating Roller Coasters: <a href="...">16</a></h4>
 *   - Each coaster row links to its detail page using an unquoted href:
 *       <td data-sort="Batman The Ride"><a href=/5.htm>Batman The Ride</a>
 *
 * Only anchors of the form href=/DIGITS.htm found between that heading and
 * the following <section> tag are collected.
 *
 * @param html - Raw HTML of the park page.
 * @returns Unique coaster names in page order; an empty array when the
 *   heading is not present or no matching links are found.
 */
function parseOperatingCoasters(html: string): string[] {
  const headingAt = html.search(/Operating\s+Roller\s+Coasters/i);
  if (headingAt < 0) return [];

  // Limit the scan to the text from the heading up to the next <section>
  // opening tag (e.g. "Defunct Roller Coasters"); if there is none, scan to
  // the end of the document.
  const tail = html.slice(headingAt);
  const sectionEnd = tail.search(/<section\b/i);
  const scope = sectionEnd > 0 ? tail.slice(0, sectionEnd) : tail;

  // Detail-page anchors only: the href may be unquoted (href=/1234.htm).
  // General links such as /g.htm, /r.htm or /location.htm fail the \d+\.htm test.
  const detailLink = /<a\s[^>]*href=["']?\/(\d+)\.htm["']?[^>]*>([^<]+)<\/a>/gi;

  // A Set drops repeats while remembering first-seen order.
  const unique = new Set<string>();
  for (const [, , label] of scope.matchAll(detailLink)) {
    const coaster = decodeHtmlEntities(label.trim());
    if (coaster) unique.add(coaster);
  }
  return Array.from(unique);
}

/** Named entities that are decoded; any other alphabetic entity is dropped. */
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/**
 * Decode HTML character references in a piece of scraped text.
 *
 * Supported forms:
 *   - named:   &amp; &lt; &gt; &quot; &apos;
 *   - decimal: &#38;
 *   - hex:     &#x26;
 * Any other purely alphabetic named entity (e.g. &nbsp;) is removed.
 *
 * The whole string is processed in a single pass, so text produced by one
 * replacement is never re-interpreted as another entity: "&amp;lt;" becomes
 * "&lt;", not "<".
 *
 * @param text - String that may contain HTML character references.
 * @returns The text with the references above resolved. Numeric references
 *   beyond U+10FFFF are removed rather than throwing.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(
    /&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi,
    (_whole: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) {
        // Lookup is case-sensitive; unknown names fall back to removal.
        return NAMED_ENTITIES.get(name) ?? "";
      }
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
      // fromCodePoint (unlike fromCharCode) handles astral code points, but
      // throws a RangeError above U+10FFFF, so guard the upper bound.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : "";
    },
  );
}