test: add coaster name matching test suite
Extract matching logic into lib/coaster-match.ts (isCoasterMatch + normalizeForMatch) so it can be imported by both the scraper and tests without duplication.

Add tests/coaster-matching.test.ts covering all known match/false-positive cases:
- Trademark symbols, leading THE, possessives, punctuation
- Subtitle variants in both directions (Apocalypse, New Revolution - Classic)
- Space-split brand words (BAT GIRL vs Batgirl)
- 4D subtitle extension (THE JOKER™ 4D Free Fly Coaster vs Joker)
- False positives: Joker y Harley Quinn, conjunction connectors

Run with: npm test

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
64
lib/coaster-match.ts
Normal file
64
lib/coaster-match.ts
Normal file
@@ -0,0 +1,64 @@
|
||||
/**
|
||||
* Coaster name matching — shared between the Queue-Times scraper and tests.
|
||||
*
|
||||
* Queue-Times and RCDB use different name conventions:
|
||||
* - Trademark symbols (™ ® ©)
|
||||
* - Leading "THE " prefixes
|
||||
* - Possessives ("Catwoman's" vs "Catwoman")
|
||||
* - Subtitles added or dropped ("Apocalypse" vs "Apocalypse the Ride")
|
||||
* - Space-split brand words ("BAT GIRL" vs "Batgirl")
|
||||
* - Conjunction-joined compound rides ("Joker y Harley Quinn" ≠ "Joker")
|
||||
*/
|
||||
|
||||
// Words that join two ride names rather than extend one subtitle.
|
||||
// When a prefix match is found and the next word is one of these,
|
||||
// the longer name is a *different* ride, not a subtitle.
|
||||
// Entries are lower-case single tokens: they are looked up against the first
// word that follows a matched prefix in an already-normalized (lower-cased) name.
const CONJUNCTIONS = new Set<string>([
  "y",
  "and",
  "&",
  "with",
  "de",
  "del",
  "e",
  "et",
]);
|
||||
|
||||
/**
 * Normalize a ride name into a canonical, comparison-friendly form.
 *
 * Both sides of a comparison (the Queue-Times name and every RCDB name) must
 * be passed through this function so the transforms stay symmetric.
 *
 * Steps, in order:
 *  1. Remove trademark symbols (™ ® ©).
 *  2. Fold diacritics (NFD decomposition, then drop combining marks) so that
 *     "Pégase" and "Pegase" compare equal.
 *  3. Remove a leading "the", tolerating leading whitespace.
 *  4. Remove possessive endings written with a straight (') or curly (’)
 *     apostrophe followed by "s".
 *  5. Rewrite "&" as the word "and" so the connector survives punctuation
 *     stripping and "A & B" compares equal to "A and B".
 *  6. Replace every remaining character that is not a letter, digit,
 *     underscore or whitespace with a space.
 *  7. Collapse whitespace runs, lower-case, and trim.
 *
 * @param name Raw ride name as published by either source.
 * @returns The normalized name; an empty string when nothing is left.
 */
export function normalizeForMatch(name: string): string {
  return name
    .replace(/[\u2122\u00ae\u00a9]/g, "") // ™ ® ©
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "") // combining marks left behind by NFD
    .replace(/^\s*the\s+/i, "") // leading "THE ", even when preceded by spaces
    .replace(/['\u2019]s\b/gi, "") // possessive 's with straight or curly apostrophe
    .replace(/&/g, " and ") // keep the connector as a real word
    .replace(/[^\p{L}\p{N}_\s]/gu, " ") // remaining punctuation → space (letters of any script kept)
    .replace(/\s+/g, " ")
    .toLowerCase()
    .trim();
}
|
||||
|
||||
/**
 * Returns true when the Queue-Times ride name matches an entry in the RCDB
 * coaster set. Every entry of the set must already have been produced by
 * normalizeForMatch; the Queue-Times name is normalized here.
 *
 * Matching strategy (first hit wins):
 *  1. Exact normalized match.
 *  2. Compact (whitespace-stripped) match, only when the compact Queue-Times
 *     name has at least 5 characters — catches "BAT GIRL" vs "Batgirl".
 *  3. Whole-word prefix match — the shorter normalized name (at least
 *     5 characters) is a prefix of the longer one AND the prefix ends on a
 *     word boundary. The match is rejected when the first word after the
 *     prefix is a conjunction, because that signals a compound ride name
 *     ("Joker y Harley Quinn") rather than a subtitle.
 *
 * The word-boundary requirement keeps unrelated rides apart whose names
 * merely share leading letters, e.g. "gold rush" vs "gold rusher" or
 * "flash" vs "flashback".
 *
 * @param qtName Raw ride name from Queue-Times.
 * @param coasterSet Normalized RCDB coaster names.
 * @returns Whether the ride is considered one of the coasters in the set.
 */
export function isCoasterMatch(qtName: string, coasterSet: Set<string>): boolean {
  // Names shorter than this are too generic for fuzzy (compact/prefix) matching.
  const minFuzzyLength = 5;

  const norm = normalizeForMatch(qtName);
  if (coasterSet.has(norm)) return true;

  const compact = norm.replace(/\s/g, "");
  const compactEligible = compact.length >= minFuzzyLength;

  for (const candidate of coasterSet) {
    // Compact comparison
    if (compactEligible && candidate.replace(/\s/g, "") === compact) return true;

    // Prefix comparison: order the pair so `shorter` is the potential prefix.
    const [shorter, longer] =
      norm.length <= candidate.length ? [norm, candidate] : [candidate, norm];
    if (shorter.length < minFuzzyLength || !longer.startsWith(shorter)) continue;

    const remainder = longer.slice(shorter.length);
    // A non-empty remainder must begin with whitespace; otherwise the prefix
    // stops in the middle of a word and the names are unrelated.
    if (remainder !== "" && !/^\s/.test(remainder)) continue;

    const nextWord = remainder.trim().split(" ")[0] ?? "";
    if (!CONJUNCTIONS.has(nextWord)) return true;
  }

  return false;
}
|
||||
Reference in New Issue
Block a user