feat: initial project scaffold with CI/CD and Docker deployment
Next.js 15 + Tailwind CSS v4 week calendar showing Six Flags park hours. Scrapes the internal CloudFront API, stores results in SQLite. Includes Dockerfile (Debian/Playwright-compatible), docker-compose, and Gitea Actions pipeline that builds and pushes to the container registry. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
140
scripts/discover.ts
Normal file
140
scripts/discover.ts
Normal file
@@ -0,0 +1,140 @@
|
||||
/**
|
||||
* One-time discovery script — finds the CloudFront API ID for each park.
|
||||
*
|
||||
* Run this once before using scrape.ts:
|
||||
* npx tsx scripts/discover.ts
|
||||
*
|
||||
* For each park in the registry it:
|
||||
* 1. Opens the park's hours page in a headless browser
|
||||
* 2. Intercepts all calls to the operating-hours CloudFront API
|
||||
* 3. Identifies the main theme park ID (filters out water parks, safari, etc.)
|
||||
* 4. Stores the ID in the database
|
||||
*
|
||||
* Re-running is safe — already-discovered parks are skipped.
|
||||
*/
|
||||
|
||||
import { chromium } from "playwright";
|
||||
import { openDb, getApiId, setApiId, type DbInstance } from "../lib/db";
|
||||
import { PARKS } from "../lib/parks";
|
||||
import { fetchParkInfo, isMainThemePark } from "../lib/scrapers/sixflags";
|
||||
|
||||
// Matches the operating-hours API URL and captures the numeric park ID.
const CLOUDFRONT_PATTERN = /operating-hours\/park\/(\d+)/;
|
||||
|
||||
async function discoverParkId(slug: string): Promise<number | null> {
|
||||
const browser = await chromium.launch({ headless: true });
|
||||
try {
|
||||
const context = await browser.newContext({
|
||||
userAgent:
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
|
||||
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
locale: "en-US",
|
||||
});
|
||||
const page = await context.newPage();
|
||||
|
||||
const capturedIds = new Set<number>();
|
||||
page.on("request", (req) => {
|
||||
const match = req.url().match(CLOUDFRONT_PATTERN);
|
||||
if (match) capturedIds.add(parseInt(match[1]));
|
||||
});
|
||||
|
||||
await page
|
||||
.goto(`https://www.sixflags.com/${slug}/park-hours?date=2026-05-01`, {
|
||||
waitUntil: "networkidle",
|
||||
timeout: 30_000,
|
||||
})
|
||||
.catch(() => null);
|
||||
|
||||
await context.close();
|
||||
|
||||
if (capturedIds.size === 0) return null;
|
||||
|
||||
// Check each captured ID — pick the main theme park (not water park / safari)
|
||||
for (const id of capturedIds) {
|
||||
const info = await fetchParkInfo(id);
|
||||
if (info && isMainThemePark(info.parkName)) {
|
||||
console.log(
|
||||
` → ID ${id} | ${info.parkAbbreviation} | ${info.parkName}`
|
||||
);
|
||||
return id;
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: return the lowest ID (usually the main park)
|
||||
const fallback = Math.min(...capturedIds);
|
||||
console.log(` → fallback to lowest ID: ${fallback}`);
|
||||
return fallback;
|
||||
} finally {
|
||||
await browser.close();
|
||||
}
|
||||
}
|
||||
|
||||
function purgeRemovedParks(db: DbInstance) {
|
||||
const knownIds = new Set(PARKS.map((p) => p.id));
|
||||
|
||||
const staleParkIds = (
|
||||
db.prepare("SELECT DISTINCT park_id FROM park_api_ids").all() as { park_id: string }[]
|
||||
)
|
||||
.map((r) => r.park_id)
|
||||
.filter((id) => !knownIds.has(id));
|
||||
|
||||
if (staleParkIds.length === 0) return;
|
||||
|
||||
console.log(`\nRemoving ${staleParkIds.length} park(s) no longer in registry:`);
|
||||
for (const parkId of staleParkIds) {
|
||||
const days = (
|
||||
db.prepare("SELECT COUNT(*) AS n FROM park_days WHERE park_id = ?").get(parkId) as { n: number }
|
||||
).n;
|
||||
db.prepare("DELETE FROM park_days WHERE park_id = ?").run(parkId);
|
||||
db.prepare("DELETE FROM park_api_ids WHERE park_id = ?").run(parkId);
|
||||
console.log(` removed ${parkId} (${days} day rows deleted)`);
|
||||
}
|
||||
console.log();
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const db = openDb();
|
||||
|
||||
purgeRemovedParks(db);
|
||||
|
||||
for (const park of PARKS) {
|
||||
const existing = getApiId(db, park.id);
|
||||
if (existing !== null) {
|
||||
console.log(`${park.name}: already known (API ID ${existing}) — skip`);
|
||||
continue;
|
||||
}
|
||||
|
||||
process.stdout.write(`${park.name} (${park.slug})... `);
|
||||
|
||||
try {
|
||||
const apiId = await discoverParkId(park.slug);
|
||||
if (apiId === null) {
|
||||
console.log("FAILED — no API IDs captured");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Fetch full info to store name/abbreviation
|
||||
const info = await fetchParkInfo(apiId);
|
||||
setApiId(db, park.id, apiId, info?.parkAbbreviation, info?.parkName);
|
||||
console.log(`done (ID ${apiId})`);
|
||||
} catch (err) {
|
||||
console.log(`ERROR: ${err}`);
|
||||
}
|
||||
|
||||
// Small delay between parks to be polite
|
||||
await new Promise((r) => setTimeout(r, 2000));
|
||||
}
|
||||
|
||||
// Print summary
|
||||
console.log("\n── Discovered IDs ──");
|
||||
for (const park of PARKS) {
|
||||
const id = getApiId(db, park.id);
|
||||
console.log(` ${park.id.padEnd(30)} ${id ?? "NOT FOUND"}`);
|
||||
}
|
||||
|
||||
db.close();
|
||||
}
|
||||
|
||||
main().catch((err) => {
|
||||
console.error("Fatal:", err);
|
||||
process.exit(1);
|
||||
});
|
||||
126
scripts/scrape.ts
Normal file
126
scripts/scrape.ts
Normal file
@@ -0,0 +1,126 @@
|
||||
/**
|
||||
* Scrape job — fetches 2026 operating hours for all parks from the Six Flags API.
|
||||
*
|
||||
* Prerequisite: run `npm run discover` first to populate API IDs.
|
||||
*
|
||||
* Run once and leave it:
|
||||
* npm run scrape
|
||||
*
|
||||
* Skips park+month combos scraped within the last week. Re-run to resume after interruption.
|
||||
* To force a full re-scrape:
|
||||
* npm run scrape:force
|
||||
*
|
||||
* Rate limiting: backs off automatically (30s → 60s → 120s per retry).
|
||||
* After exhausting retries, skips that park+month and continues.
|
||||
*/
|
||||
|
||||
import { openDb, upsertDay, getApiId, isMonthScraped } from "../lib/db";
|
||||
import { PARKS } from "../lib/parks";
|
||||
import { scrapeMonth, RateLimitError } from "../lib/scrapers/sixflags";
|
||||
|
||||
// Season being scraped and the months to cover (full calendar year).
const YEAR = 2026;
const MONTHS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
const DELAY_MS = 1000; // between successful API calls
// `npm run scrape:force` passes --rescrape to ignore the freshness cache.
const FORCE = process.argv.includes("--rescrape");
|
||||
|
||||
function monthLabel(m: number) {
|
||||
return `${YEAR}-${String(m).padStart(2, "0")}`;
|
||||
}
|
||||
|
||||
function pad(n: number, width: number) {
|
||||
return String(n).padStart(width, " ");
|
||||
}
|
||||
|
||||
async function sleep(ms: number) {
|
||||
return new Promise<void>((r) => setTimeout(r, ms));
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const db = openDb();
|
||||
|
||||
// Separate parks with known API IDs from those needing discovery
|
||||
const ready = PARKS.filter((p) => getApiId(db, p.id) !== null);
|
||||
const needsDiscovery = PARKS.filter((p) => getApiId(db, p.id) === null);
|
||||
|
||||
if (needsDiscovery.length > 0) {
|
||||
console.log(
|
||||
`⚠ ${needsDiscovery.length} parks have no API ID — run \`npm run discover\` first:\n` +
|
||||
needsDiscovery.map((p) => ` ${p.id}`).join("\n") +
|
||||
"\n"
|
||||
);
|
||||
}
|
||||
|
||||
if (ready.length === 0) {
|
||||
console.log("No parks ready to scrape. Run: npm run discover");
|
||||
db.close();
|
||||
return;
|
||||
}
|
||||
|
||||
// Build the full work queue: month × park
|
||||
const queue: { month: number; park: (typeof PARKS)[0]; apiId: number }[] = [];
|
||||
for (const month of MONTHS) {
|
||||
for (const park of ready) {
|
||||
if (!FORCE && isMonthScraped(db, park.id, YEAR, month)) continue;
|
||||
queue.push({ month, park, apiId: getApiId(db, park.id)! });
|
||||
}
|
||||
}
|
||||
|
||||
const total = MONTHS.length * ready.length;
|
||||
const skip = total - queue.length;
|
||||
console.log(
|
||||
`Scraping ${YEAR} — ${ready.length} parks × 12 months = ${total} total\n` +
|
||||
`Skipping ${skip} already-scraped. ${queue.length} to fetch.\n`
|
||||
);
|
||||
|
||||
if (queue.length === 0) {
|
||||
console.log("Nothing to do. To force a full re-scrape: npm run scrape:force");
|
||||
db.close();
|
||||
return;
|
||||
}
|
||||
|
||||
let done = 0;
|
||||
let errors = 0;
|
||||
|
||||
for (const { month, park, apiId } of queue) {
|
||||
const counter = `[${pad(done + 1, queue.length.toString().length)}/${queue.length}]`;
|
||||
process.stdout.write(`${counter} ${park.shortName.padEnd(22)} ${monthLabel(month)} ... `);
|
||||
|
||||
try {
|
||||
const days = await scrapeMonth(apiId, YEAR, month);
|
||||
const insertAll = db.transaction(() => {
|
||||
for (const d of days) upsertDay(db, park.id, d.date, d.isOpen, d.hoursLabel);
|
||||
});
|
||||
insertAll();
|
||||
|
||||
const openCount = days.filter((d) => d.isOpen).length;
|
||||
console.log(`${openCount}/${days.length} open`);
|
||||
done++;
|
||||
|
||||
if (done < queue.length) await sleep(DELAY_MS);
|
||||
} catch (err) {
|
||||
if (err instanceof RateLimitError) {
|
||||
console.log(`RATE LIMITED — skipping (re-run to retry)`);
|
||||
} else {
|
||||
console.log(`ERROR: ${err instanceof Error ? err.message : err}`);
|
||||
}
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
|
||||
const summary = [
|
||||
`\n── Summary ─────────────────────────────`,
|
||||
` Fetched : ${done}`,
|
||||
` Skipped : ${skip}`,
|
||||
` Errors : ${errors}`,
|
||||
` Total : ${total}`,
|
||||
];
|
||||
if (errors > 0) summary.push(`\nRe-run to retry failed months.`);
|
||||
console.log(summary.join("\n"));
|
||||
|
||||
db.close();
|
||||
}
|
||||
|
||||
main().catch((err) => {
|
||||
console.error("Fatal:", err);
|
||||
process.exit(1);
|
||||
});
|
||||
Reference in New Issue
Block a user