Compare commits
3 Commits
dc4fbeb7ec
...
766fc296a1
| Author | SHA1 | Date | |
|---|---|---|---|
| 766fc296a1 | |||
| 8324f31972 | |||
| 9cac86d241 |
64
lib/coaster-match.ts
Normal file
64
lib/coaster-match.ts
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
/**
|
||||||
|
* Coaster name matching — shared between the Queue-Times scraper and tests.
|
||||||
|
*
|
||||||
|
* Queue-Times and RCDB use different name conventions:
|
||||||
|
* - Trademark symbols (™ ® ©)
|
||||||
|
* - Leading "THE " prefixes
|
||||||
|
* - Possessives ("Catwoman's" vs "Catwoman")
|
||||||
|
* - Subtitles added or dropped ("Apocalypse" vs "Apocalypse the Ride")
|
||||||
|
* - Space-split brand words ("BAT GIRL" vs "Batgirl")
|
||||||
|
* - Conjunction-joined compound rides ("Joker y Harley Quinn" ≠ "Joker")
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Words that join two ride names rather than extend one subtitle.
|
||||||
|
// When a prefix match is found and the next word is one of these,
|
||||||
|
// the longer name is a *different* ride, not a subtitle.
|
||||||
|
const CONJUNCTIONS = new Set<string>([
  // Each entry is compared against the first word that follows a matched prefix.
  "y",
  "and",
  "&",
  "with",
  "de",
  "del",
  "e",
  "et",
]);
|
||||||
|
|
||||||
|
/**
 * Normalize a ride name for matching.
 *
 * Both sides (Queue-Times and RCDB) must be normalized with this function
 * before any comparison so the transforms are symmetric.
 *
 * Steps, in order:
 *  1. Fold diacritics (canonical decomposition, then drop combining marks),
 *     so "Démon" and "Demon" normalize to the same string.
 *  2. Strip trademark symbols (™ ® ©).
 *  3. Trim, then strip a leading "THE " — trimming first means stray leading
 *     whitespace cannot hide the prefix.
 *  4. Strip possessives ('s / ’s).
 *  5. Rewrite "&" as the word "and". Without this the ampersand would be
 *     erased by the punctuation step and "A & B" would look like "A B",
 *     i.e. a subtitle rather than a compound name.
 *  6. Replace all remaining punctuation with a space; letters and digits of
 *     any script (and "_") are kept.
 *  7. Collapse whitespace, lowercase, trim.
 *
 * @param name - Raw ride name as published by either data source.
 * @returns The normalized, lowercase, single-spaced form of `name`.
 */
export function normalizeForMatch(name: string): string {
  return name
    .normalize("NFD")
    .replace(/\p{M}/gu, "") // drop combining accents left by NFD
    .replace(/[\u2122\u00ae\u00a9]/g, "") // strip ™ ® ©
    .trim()
    .replace(/^the\s+/i, "") // strip leading "THE "
    .replace(/['\u2019]s\b/gi, "") // strip possessives ('s / ’s)
    .replace(/&/g, " and ") // keep the conjunction visible as a word
    .replace(/[^\p{L}\p{N}_\s]/gu, " ") // all remaining punctuation → space
    .replace(/\s+/g, " ")
    .toLowerCase()
    .trim();
}
|
||||||
|
|
||||||
|
/**
 * Returns true when the Queue-Times ride name matches an entry in the RCDB
 * coaster set (which must be built with normalizeForMatch).
 *
 * Matching strategy (in order):
 *  1. Exact normalized match.
 *  2. Compact (space-stripped) match — catches "BAT GIRL" vs "Batgirl".
 *     Only attempted when the compact name has at least 5 characters.
 *  3. Prefix match — the shorter normalized name (minimum 5 chars) is a
 *     prefix of the longer one AND the prefix ends on a word boundary, so
 *     "thunder" does not match "thunderbolt". The match is rejected when the
 *     next word after the prefix is a conjunction, which signals a compound
 *     ride name rather than a subtitle.
 *
 * @param qtName - Raw (un-normalized) ride name from Queue-Times.
 * @param coasterSet - Normalized RCDB coaster names for one park.
 * @returns Whether `qtName` refers to one of the coasters in `coasterSet`.
 */
export function isCoasterMatch(qtName: string, coasterSet: Set<string>): boolean {
  // Shorter names are too ambiguous for the compact / prefix strategies.
  const minFuzzyLength = 5;

  const norm = normalizeForMatch(qtName);
  if (coasterSet.has(norm)) return true;

  const compact = norm.replace(/\s/g, "");
  const compactEligible = compact.length >= minFuzzyLength;

  for (const c of coasterSet) {
    // Compact comparison
    if (compactEligible && c.replace(/\s/g, "") === compact) return true;

    // Prefix comparison
    const [shorter, longer] = norm.length <= c.length ? [norm, c] : [c, norm];
    if (shorter.length < minFuzzyLength || !longer.startsWith(shorter)) continue;

    // Normalized names are single-spaced, so a prefix that ends on a word
    // boundary is followed by a space (or by nothing at all). Anything else
    // means the prefix stops in the middle of a word.
    const rest = longer.slice(shorter.length);
    if (rest.length > 0 && !rest.startsWith(" ")) continue;

    const nextWord = rest.trim().split(" ")[0] ?? "";
    if (!CONJUNCTIONS.has(nextWord)) return true;
  }

  return false;
}
|
||||||
@@ -52,28 +52,8 @@ export function areCoastersStale(entry: ParkMeta): boolean {
|
|||||||
return Date.now() - new Date(entry.coasters_scraped_at).getTime() > COASTER_STALE_MS;
|
return Date.now() - new Date(entry.coasters_scraped_at).getTime() > COASTER_STALE_MS;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
import { normalizeForMatch } from "./coaster-match";
|
||||||
* Normalize a ride name for fuzzy matching between data sources.
|
export { normalizeForMatch as normalizeRideName } from "./coaster-match";
|
||||||
*
|
|
||||||
* Queue-Times uses branded names (BATMAN™ The Ride, THE JOKER™ Funhouse Coaster)
|
|
||||||
* while RCDB uses clean names (Batman The Ride, Joker Funhouse Coaster).
|
|
||||||
*
|
|
||||||
* Normalization steps:
|
|
||||||
* 1. Strip trademark/copyright symbols (™ ® ©)
|
|
||||||
* 2. Strip leading "THE " / "THE" prefix
|
|
||||||
* 3. Replace punctuation (- : ' ") with spaces
|
|
||||||
* 4. Collapse runs of whitespace
|
|
||||||
* 5. Lowercase and trim
|
|
||||||
*/
|
|
||||||
export function normalizeRideName(name: string): string {
|
|
||||||
return name
|
|
||||||
.replace(/[\u2122\u00ae\u00a9™®©]/g, "")
|
|
||||||
.replace(/^the\s+/i, "")
|
|
||||||
.replace(/[^\w\s]/g, " ")
|
|
||||||
.replace(/\s+/g, " ")
|
|
||||||
.toLowerCase()
|
|
||||||
.trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a Set of normalized coaster names for fast membership checks.
|
* Returns a Set of normalized coaster names for fast membership checks.
|
||||||
@@ -82,5 +62,5 @@ export function normalizeRideName(name: string): string {
|
|||||||
export function getCoasterSet(parkId: string, meta: ParkMetaMap): Set<string> | null {
|
export function getCoasterSet(parkId: string, meta: ParkMetaMap): Set<string> | null {
|
||||||
const entry = meta[parkId];
|
const entry = meta[parkId];
|
||||||
if (!entry || entry.coasters.length === 0) return null;
|
if (!entry || entry.coasters.length === 0) return null;
|
||||||
return new Set(entry.coasters.map(normalizeRideName));
|
return new Set(entry.coasters.map(normalizeForMatch));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,48 +7,10 @@
|
|||||||
* See: https://queue-times.com/en-US/pages/api
|
* See: https://queue-times.com/en-US/pages/api
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import { isCoasterMatch } from "../coaster-match";
|
||||||
|
|
||||||
const BASE = "https://queue-times.com/parks";
|
const BASE = "https://queue-times.com/parks";
|
||||||
|
|
||||||
/**
|
|
||||||
* Normalize a ride name for fuzzy matching between Queue-Times and RCDB.
|
|
||||||
*
|
|
||||||
* - Strips trademark/copyright symbols (™ ® © and Unicode variants)
|
|
||||||
* - Strips leading "THE " prefix
|
|
||||||
* - Replaces ALL non-word, non-space characters with a space
|
|
||||||
* (handles !, -, :, ', ' U+2019, ", and any other punctuation)
|
|
||||||
* - Collapses whitespace, lowercases, trims
|
|
||||||
*/
|
|
||||||
function normalize(name: string): string {
|
|
||||||
return name
|
|
||||||
.replace(/[\u2122\u00ae\u00a9™®©]/g, "")
|
|
||||||
.replace(/^the\s+/i, "")
|
|
||||||
.replace(/[^\w\s]/g, " ")
|
|
||||||
.replace(/\s+/g, " ")
|
|
||||||
.toLowerCase()
|
|
||||||
.trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Check if a Queue-Times ride name matches any coaster in the RCDB set.
|
|
||||||
*
|
|
||||||
* Exact normalized match covers most cases. Prefix matching handles cases
|
|
||||||
* where one source drops or adds a subtitle:
|
|
||||||
* "Apocalypse" (QT) vs "Apocalypse the Ride" (RCDB)
|
|
||||||
* "The New Revolution - Classic" (QT) vs "New Revolution" (RCDB)
|
|
||||||
*
|
|
||||||
* Minimum 5 chars on the shorter side prevents accidental short matches.
|
|
||||||
*/
|
|
||||||
function isCoaster(name: string, coasterSet: Set<string>): boolean {
|
|
||||||
const norm = normalize(name);
|
|
||||||
if (coasterSet.has(norm)) return true;
|
|
||||||
for (const c of coasterSet) {
|
|
||||||
const shorter = norm.length <= c.length ? norm : c;
|
|
||||||
const longer = norm.length <= c.length ? c : norm;
|
|
||||||
if (shorter.length >= 5 && longer.startsWith(shorter)) return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
const HEADERS = {
|
const HEADERS = {
|
||||||
"User-Agent":
|
"User-Agent":
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
|
||||||
@@ -131,7 +93,7 @@ export async function fetchLiveRides(
|
|||||||
isOpen: r.is_open,
|
isOpen: r.is_open,
|
||||||
waitMinutes: r.wait_time ?? 0,
|
waitMinutes: r.wait_time ?? 0,
|
||||||
lastUpdated: r.last_updated,
|
lastUpdated: r.last_updated,
|
||||||
isCoaster: coasterNames ? isCoaster(r.name, coasterNames) : false,
|
isCoaster: coasterNames ? isCoasterMatch(r.name, coasterNames) : false,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -144,7 +106,7 @@ export async function fetchLiveRides(
|
|||||||
isOpen: r.is_open,
|
isOpen: r.is_open,
|
||||||
waitMinutes: r.wait_time ?? 0,
|
waitMinutes: r.wait_time ?? 0,
|
||||||
lastUpdated: r.last_updated,
|
lastUpdated: r.last_updated,
|
||||||
isCoaster: coasterNames ? isCoaster(r.name, coasterNames) : false,
|
isCoaster: coasterNames ? isCoasterMatch(r.name, coasterNames) : false,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -10,7 +10,8 @@
|
|||||||
"scrape": "tsx scripts/scrape.ts",
|
"scrape": "tsx scripts/scrape.ts",
|
||||||
"scrape:force": "tsx scripts/scrape.ts --rescrape",
|
"scrape:force": "tsx scripts/scrape.ts --rescrape",
|
||||||
"discover": "tsx scripts/discover.ts",
|
"discover": "tsx scripts/discover.ts",
|
||||||
"debug": "tsx scripts/debug.ts"
|
"debug": "tsx scripts/debug.ts",
|
||||||
|
"test": "tsx --test tests/*.test.ts"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"better-sqlite3": "^12.8.0",
|
"better-sqlite3": "^12.8.0",
|
||||||
|
|||||||
51
tests/coaster-matching.test.ts
Normal file
51
tests/coaster-matching.test.ts
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
/**
|
||||||
|
* Coaster name matching tests.
|
||||||
|
*
|
||||||
|
* Each entry is a real case found between Queue-Times and RCDB names.
|
||||||
|
* Add new cases here when fixing a mismatch or false positive.
|
||||||
|
*
|
||||||
|
* Run with: npm test
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { test } from "node:test";
|
||||||
|
import assert from "node:assert/strict";
|
||||||
|
import { isCoasterMatch, normalizeForMatch } from "../lib/coaster-match";
|
||||||
|
|
||||||
|
/** Builds an RCDB coaster set for a test case by normalizing each given name. */
function set(...rcdbNames: string[]): Set<string> {
  const normalized = new Set<string>();
  for (const rcdbName of rcdbNames) {
    normalized.add(normalizeForMatch(rcdbName));
  }
  return normalized;
}
|
||||||
|
|
||||||
|
// ── Should match ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// Each row: Queue-Times name, RCDB name, and the park(s) where the pair was seen.
const SHOULD_MATCH: Array<[qtName: string, rcdbName: string, park: string]> = [
  ["BATMAN™ The Ride", "Batman The Ride", "Over Georgia / Magic Mountain"],
  ["THE RIDDLER Mindbender", "Riddler Mindbender", "Over Georgia"],
  ["THE RIDDLER™'s Revenge", "Riddler's Revenge", "Magic Mountain"],
  ["CATWOMAN™ Whip", "Catwoman's Whip", "New England"],
  ["SUPERMAN™: Ultimate Flight", "Superman - Ultimate Flight", "Over Georgia"],
  ["THE JOKER™ Funhouse Coaster", "Joker Funhouse Coaster", "Over Georgia"],
  ["The Great American Scream Machine", "Great American Scream Machine", "Over Georgia"],
  ["Apocalypse", "Apocalypse the Ride", "Magic Mountain"],
  ["The New Revolution - Classic", "New Revolution", "Magic Mountain"],
  ["SCREAM", "Scream!", "Magic Mountain"],
  ["BAT GIRL™: Coaster Chase", "Batgirl Coaster Chase", "Fiesta Texas"],
  ["THE JOKER™ 4D Free Fly Coaster", "Joker", "New England"],
];
|
||||||
|
|
||||||
|
// Register one test per expected-match pair.
SHOULD_MATCH.forEach(([qt, rcdb, park]) => {
  test(`match: "${qt}" = "${rcdb}" (${park})`, () => {
    const matched = isCoasterMatch(qt, set(rcdb));
    assert.ok(matched, `Expected match`);
  });
});
|
||||||
|
|
||||||
|
// ── Should NOT match (false positives) ───────────────────────────────────────
|
||||||
|
|
||||||
|
// Each row: Queue-Times name, RCDB name it must NOT be matched to, and the park.
const SHOULD_NOT_MATCH: Array<[qtName: string, rcdbName: string, park: string]> = [
  ["Joker y Harley Quinn", "Joker", "Six Flags Mexico"],
];
|
||||||
|
|
||||||
|
// Register one test per known false-positive pair.
SHOULD_NOT_MATCH.forEach(([qt, rcdb, park]) => {
  test(`no match: "${qt}" ≠ "${rcdb}" (${park})`, () => {
    const matched = isCoasterMatch(qt, set(rcdb));
    assert.ok(!matched, `Expected no match`);
  });
});
|
||||||
Reference in New Issue
Block a user