fix: preserve historical day records, skip scraping past months
upsertDay: add WHERE park_days.date >= date('now') to the ON CONFLICT
DO UPDATE clause. Past dates now behave as INSERT OR IGNORE — new rows
are written freely but existing historical records are never overwritten.
The API stops returning elapsed dates, so the DB row is the permanent
source of truth for any date that has already occurred.
isMonthScraped: months whose last day is before today (compared as UTC
calendar dates) are permanently skipped regardless of staleness age. The API
has no data for past months, so re-scraping them wastes API calls and cannot
improve the records.
Current and future months continue to use the 7-day staleness window.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
26
lib/db.ts
26
lib/db.ts
@@ -46,6 +46,11 @@ export function upsertDay(
|
||||
hoursLabel?: string,
|
||||
specialType?: string
|
||||
) {
|
||||
// For past dates: INSERT new rows freely, but never overwrite existing records.
|
||||
// The API stops returning past dates once they've elapsed, so the DB row is the
|
||||
// permanent historical truth — we must not let a future scrape clobber it.
|
||||
//
|
||||
// For today and future dates: full upsert — the schedule can still change.
|
||||
db.prepare(`
|
||||
INSERT INTO park_days (park_id, date, is_open, hours_label, special_type, scraped_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
@@ -54,6 +59,7 @@ export function upsertDay(
|
||||
hours_label = excluded.hours_label,
|
||||
special_type = excluded.special_type,
|
||||
scraped_at = excluded.scraped_at
|
||||
WHERE park_days.date >= date('now')
|
||||
`).run(parkId, date, isOpen ? 1 : 0, hoursLabel ?? null, specialType ?? null, new Date().toISOString());
|
||||
}
|
||||
|
||||
@@ -160,16 +166,32 @@ export function getMonthCalendar(
|
||||
return result;
|
||||
}
|
||||
|
||||
/** True if the DB already has at least one row for this park+month. */
|
||||
const STALE_AFTER_MS = 7 * 24 * 60 * 60 * 1000; // 1 week
|
||||
|
||||
/** True if the DB has data for this park+month scraped within the last week. */
|
||||
/**
|
||||
* Returns true when the scraper should skip this park+month.
|
||||
*
|
||||
* Two reasons to skip:
|
||||
* 1. The month is entirely in the past — the API will never return data for
|
||||
* those dates again, so re-scraping wastes a call and cannot improve the
|
||||
* stored records. Historical records are preserved forever by upsertDay.
|
||||
* 2. The month was scraped within the last 7 days — data is still fresh.
|
||||
*/
|
||||
export function isMonthScraped(
|
||||
db: Database.Database,
|
||||
parkId: string,
|
||||
year: number,
|
||||
month: number
|
||||
): boolean {
|
||||
// Compute the last calendar day of this month (avoids timezone issues).
|
||||
const daysInMonth = new Date(year, month, 0).getDate();
|
||||
const lastDay = `${year}-${String(month).padStart(2, "0")}-${String(daysInMonth).padStart(2, "0")}`;
|
||||
const today = new Date().toISOString().slice(0, 10);
|
||||
|
||||
// Past month — history is locked in, no API data available, always skip.
|
||||
if (lastDay < today) return true;
|
||||
|
||||
// Current/future month — skip only if recently scraped.
|
||||
const prefix = `${year}-${String(month).padStart(2, "0")}`;
|
||||
const row = db
|
||||
.prepare(
|
||||
|
||||
Reference in New Issue
Block a user