#!/bin/sh
# Nightly scraper scheduler — runs inside the Docker scraper service.
#
# Behaviour:
#   1. Runs an initial scrape immediately on container start.
#   2. Sleeps until 3:00 AM (container timezone, set via TZ env var).
#   3. Runs the scraper, then sleeps until the next 3:00 AM, forever.
#
# Timezone: set TZ in the scraper service environment to control when
# "3am" is (e.g. TZ=America/New_York). Defaults to UTC if unset.
#
# Portability: written for POSIX sh. GNU date is used when available; a
# plain wall-clock calculation is used otherwise (e.g. BusyBox date).

# Fail loudly on typos in variable names. `set -e` is deliberately NOT used:
# a failed scrape must never terminate the scheduler.
set -u

# Hour of day (0-23, local time) at which the nightly scrape runs.
readonly SCRAPE_HOUR=3

#######################################
# Print a timestamped scheduler message.
# Arguments: $* - message text
# Outputs:   one line on stdout
#######################################
log() {
  # printf instead of echo so backslashes in the message are never interpreted.
  printf '%s\n' "[scheduler] $(date '+%Y-%m-%d %H:%M %Z') — $*"
}

#######################################
# Run the scraper once and log the outcome.
# Outputs: progress messages on stdout (plus whatever npm prints)
# Returns: always 0 — a failed scrape is logged, not propagated, so the
#          scheduling loop keeps running.
#######################################
run_scrape() {
  log "Starting scrape"
  if npm run scrape; then
    log "Scrape completed"
  else
    log "Scrape failed — will retry at next scheduled time"
  fi
}

#######################################
# Compute the number of seconds from now until the next 03:00 local time.
# At exactly 03:00:00 the result is a full day, not zero.
# Outputs: a positive integer on stdout
#######################################
seconds_until_3am() {
  s3_now=$(date +%s)

  # Preferred path: let GNU date resolve the target instant, which accounts
  # for DST changes. Errors are silenced on purpose: other date
  # implementations reject these strings and we fall back below.
  s3_target=$(date -d "today 0${SCRAPE_HOUR}:00" +%s 2>/dev/null) || s3_target=
  if [ -n "$s3_target" ] && [ "$s3_now" -ge "$s3_target" ]; then
    # Today's slot has already passed; aim for tomorrow's.
    s3_target=$(date -d "tomorrow 0${SCRAPE_HOUR}:00" +%s 2>/dev/null) || s3_target=
  fi

  case "$s3_target" in
    '' | *[!0-9]*)
      # date -d unsupported or returned garbage: use the fallback.
      ;;
    *)
      if [ "$s3_target" -gt "$s3_now" ]; then
        echo $((s3_target - s3_now))
        return 0
      fi
      ;;
  esac

  # Fallback: derive the delay from the current wall-clock time.
  # This ignores DST shifts, so it can be off by the DST offset on the day
  # the clocks change; it self-corrects on the following night.
  s3_clock=$(date '+%H:%M:%S')
  s3_h=${s3_clock%%:*}
  s3_rest=${s3_clock#*:}
  s3_m=${s3_rest%%:*}
  s3_s=${s3_rest#*:}

  # Strip one leading zero: "08"/"09" would be invalid octal in $(( )).
  s3_elapsed=$((${s3_h#0} * 3600 + ${s3_m#0} * 60 + ${s3_s#0}))
  s3_until=$((SCRAPE_HOUR * 3600 - s3_elapsed))
  if [ "$s3_until" -le 0 ]; then
    s3_until=$((s3_until + 86400))
  fi
  echo "$s3_until"
}

# ── Run immediately on startup ────────────────────────────────────────────────
run_scrape

# ── Nightly loop ──────────────────────────────────────────────────────────────
while true; do
  delay=$(seconds_until_3am)

  # Never hand sleep an empty, negative or non-numeric value: sleep would
  # fail instantly and the loop would launch scrapes back-to-back.
  case "$delay" in
    '' | *[!0-9]*)
      log "Could not compute time until next scrape — retrying in 60s"
      sleep 60
      continue
      ;;
  esac

  # "@epoch" is understood by both GNU and BusyBox date.
  next=$(date -d "@$(($(date +%s) + delay))" '+%Y-%m-%d %H:%M %Z' 2>/dev/null) \
    || next="0${SCRAPE_HOUR}:00"

  log "Next scrape in $((delay / 3600))h $(((delay % 3600) / 60))m (${next})"
  sleep "$delay"
  run_scrape
done