name-suggestion-index/lib/matcher.js
Bryan Housel 1c4fc70e3b WIP Modernize
- switch to type: module
- replace all CJS require/module.exports with ES6 import/export
2021-06-22 00:04:52 -04:00

530 lines
19 KiB
JavaScript

import whichPolygon from 'which-polygon';
import { simplify } from './simplify.js';
// This will not work in the browser :(
// We may be able to switch to `import`, but:
// - for node, this requires running with --experimental-json-modules https://stackoverflow.com/a/59758333/7620
// - for browser, both esbuild and rollup should be able to bundle a .json import
import fs from 'fs';
// Read and parse a JSON file synchronously.
// Relative paths are resolved by `fs` against the current working directory of the process.
const readJSON = (path) => JSON.parse(fs.readFileSync(path, 'utf8'));

// Each config file wraps its payload in a single top-level property named after the file.
const { matchGroups } = readJSON('./config/matchGroups.json');
const { genericWords } = readJSON('./config/genericWords.json');
const { trees } = readJSON('./config/trees.json');
//
// `Matcher()`
// Creates and returns a matcher object with these methods:
//   `buildMatchIndex(data)`           - prepare the name index (required before `match`)
//   `buildLocationIndex(data, loco)`  - prepare the optional location index
//   `match(k, v, n, loc)`             - look up canonical items or exclude patterns
//   `getWarnings()`                   - return the warnings array
// All of the index structures below are private to the returned matcher.
//
export function Matcher() {

  // The `_matchIndex` is a specialized structure that allows us to quickly answer
  //   _"Given a [key/value tagpair, name, location], what canonical items (brands etc) can match it?"_
  //
  // The index contains all valid combinations of k/v tagpairs and names
  // matchIndex (a Map keyed on 'k/v'):
  // {
  //   'k/v': {
  //     'primary':         Map (String 'nsimple' -> Set (itemIDs…),   // matches for tags like `name`, `name:xx`, etc.
  //     'alternate':       Map (String 'nsimple' -> Set (itemIDs…),   // matches for tags like `alt_name`, `brand`, etc.
  //     'excludeNamed':    Map (String 'pattern' -> RegExp),
  //     'excludeGeneric':  Map (String 'pattern' -> RegExp)
  //   },
  // }
  //
  // {
  //   'amenity/bank': {
  //     'primary': {
  //       'firstbank':              Set ("firstbank-978cca", "firstbank-9794e6", "firstbank-f17495", …),
  //       …
  //     },
  //     'alternate': {
  //       '1stbank':                Set ("firstbank-f17495"),
  //       …
  //     }
  //   },
  //   'shop/supermarket': {
  //     'primary': {
  //       'coop':                   Set ("coop-76454b", "coop-ebf2d9", "coop-36e991", …),
  //       'coopfood':               Set ("coopfood-a8278b", …),
  //       …
  //     },
  //     'alternate': {
  //       'coop':                   Set ("coopfood-a8278b", …),
  //       'federatedcooperatives':  Set ("coop-76454b", …),
  //       'thecooperative':         Set ("coopfood-a8278b", …),
  //       …
  //     }
  //   }
  // }
  //
  let _matchIndex;

  // The `_genericWords` structure matches the contents of genericWords.json to instantiated RegExp objects
  // Map (String 'pattern' -> RegExp),
  const _genericWords = new Map();
  (genericWords || []).forEach(s => _genericWords.set(s, new RegExp(s, 'i')));

  // The `_itemLocation` structure maps itemIDs to locationSetIDs:
  // {
  //   'firstbank-f17495':  '+[first_bank_western_us.geojson]',
  //   'firstbank-978cca':  '+[first_bank_carolinas.geojson]',
  //   'coop-76454b':       '+[Q16]',
  //   'coopfood-a8278b':   '+[Q23666]',
  //   …
  // }
  let _itemLocation;

  // The `_locationSets` structure maps locationSetIDs to *resolved* locationSets:
  // {
  //   '+[first_bank_western_us.geojson]':  GeoJSON {…},
  //   '+[first_bank_carolinas.geojson]':   GeoJSON {…},
  //   '+[Q16]':                            GeoJSON {…},
  //   '+[Q23666]':                         GeoJSON {…},
  //   …
  // }
  let _locationSets;

  // The _locationIndex is an instance of which-polygon spatial index for the locationSets.
  let _locationIndex;

  const _warnings = [];    // array of match conflict pairs (nothing in this function adds to it yet)
  const matcher = {};


  //
  // `buildMatchIndex()`
  // Call this to prepare the matcher for use
  //
  // `data` needs to be an Object indexed on a 'tree/key/value' path.
  // (e.g. cache filled by `fileTree.read` or data found in `dist/nsi.json`)
  // {
  //    'brands/amenity/bank': { properties: {}, items: [ {}, {}, … ] },
  //    'brands/amenity/bar':  { properties: {}, items: [ {}, {}, … ] },
  //    …
  // }
  //
  // Note: this mutates the items in `data` - redundant `matchTags` and `matchNames` are removed (#3417).
  // Calling it again after the index exists is a no-op.
  //
  matcher.buildMatchIndex = (data) => {
    if (_matchIndex) return;   // it was built already
    _matchIndex = new Map();

    // There are a few exceptions to the name matching regexes.
    // Usually a tag suffix contains a language code like `name:en`, `name:ru`
    // but we want to exclude things like `operator:type`, `name:etymology`, etc..
    // (The pattern is the same for every category, so it is compiled once here.)
    const notName = /:(colou?r|type|forward|backward|left|right|etymology|pronunciation|wikipedia)$/i;

    Object.keys(data).forEach(tkv => {
      const category = data[tkv];
      const [t, k, v] = tkv.split('/', 3);     // tkv = "tree/key/value"
      const thiskv = `${k}/${v}`;
      const tree = trees[t];

      // The branch is created even for categories without items, so that their exclusions are indexed.
      const branch = getOrCreateBranch(thiskv);

      // ADD EXCLUSIONS
      const properties = category.properties || {};
      const exclude = properties.exclude || {};
      (exclude.generic || []).forEach(s => branch.excludeGeneric.set(s, new RegExp(s, 'i')));
      (exclude.named || []).forEach(s => branch.excludeNamed.set(s, new RegExp(s, 'i')));
      const excludeRegexes = [...branch.excludeGeneric.values(), ...branch.excludeNamed.values()];

      // ADD ITEMS
      const items = category.items;
      if (!Array.isArray(items) || !items.length) return;

      // Primary name patterns, match tags to take first
      //  e.g. `name`, `name:ru`
      const primaryName = new RegExp(tree.nameTags.primary, 'i');

      // Alternate name patterns, match tags to consider after primary
      //  e.g. `alt_name`, `short_name`, `brand`, `brand:ru`, etc..
      const alternateName = new RegExp(tree.nameTags.alternate, 'i');

      // For certain categories we do not want to match generic KV pairs like `building/yes` or `amenity/yes`
      const skipGenericKV = skipGenericKVMatches(t, k, v);

      // We will collect the generic KV pairs anyway (for the purpose of filtering them out of matchTags)
      const genericKV = new Set([`${k}/yes`, `building/yes`]);

      // Collect alternate tagpairs for this kv category from matchGroups.
      // We might also pick up a few more generic KVs (like `shop/yes`)
      const matchGroupKV = new Set();
      Object.values(matchGroups).forEach(matchGroup => {
        if (!matchGroup.includes(thiskv)) return;   // this kv is not in the group

        matchGroup.forEach(otherkv => {
          if (otherkv === thiskv) return;   // skip self
          matchGroupKV.add(otherkv);

          const otherk = otherkv.split('/', 2)[0];   // we might pick up a `shop/yes`
          genericKV.add(`${otherk}/yes`);
        });
      });

      // For each item, insert all [key, value, name] combinations into the match index
      items.forEach(item => {
        if (!item.id) return;

        // Automatically remove redundant `matchTags` - #3417
        // (i.e. This kv is already covered by matchGroups, so it doesn't need to be in `item.matchTags`)
        if (Array.isArray(item.matchTags) && item.matchTags.length) {
          item.matchTags = item.matchTags
            .filter(matchTag => !matchGroupKV.has(matchTag) && !genericKV.has(matchTag));

          if (!item.matchTags.length) delete item.matchTags;
        }

        // key/value tagpairs to insert into the match index..
        let kvTags = [`${thiskv}`]
          .concat(item.matchTags || []);

        if (!skipGenericKV) {
          kvTags = kvTags
            .concat(Array.from(genericKV));  // #3454 - match some generic tags
        }

        // Index all the namelike tag values
        // (an item without `tags` is tolerated - it may still contribute `matchNames` below)
        const tags = item.tags || {};
        Object.keys(tags).forEach(osmkey => {
          if (notName.test(osmkey)) return;   // osmkey is not a namelike tag, skip
          const osmvalue = tags[osmkey];
          if (!osmvalue || excludeRegexes.some(regex => regex.test(osmvalue))) return;   // osmvalue missing or excluded

          if (primaryName.test(osmkey)) {
            kvTags.forEach(kv => insertName('primary', kv, simplify(osmvalue), item.id));
          } else if (alternateName.test(osmkey)) {
            kvTags.forEach(kv => insertName('alternate', kv, simplify(osmvalue), item.id));
          }
        });

        // Index `matchNames` after indexing all other names..
        const keepMatchNames = new Set();
        (item.matchNames || []).forEach(matchName => {
          // If this matchname isn't already indexed, add it to the alternate index
          const nsimple = simplify(matchName);
          kvTags.forEach(kv => {
            const kvBranch = _matchIndex.get(kv);
            const primaryLeaf = kvBranch && kvBranch.primary.get(nsimple);
            const alternateLeaf = kvBranch && kvBranch.alternate.get(nsimple);
            const inPrimary = primaryLeaf && primaryLeaf.has(item.id);
            const inAlternate = alternateLeaf && alternateLeaf.has(item.id);

            if (!inPrimary && !inAlternate) {
              insertName('alternate', kv, nsimple, item.id);
              keepMatchNames.add(matchName);
            }
          });
        });

        // Automatically remove redundant `matchNames` - #3417
        // (i.e. This name got indexed some other way, so it doesn't need to be in `item.matchNames`)
        if (keepMatchNames.size) {
          item.matchNames = Array.from(keepMatchNames);
        } else {
          delete item.matchNames;
        }

      });   // each item
    });   // each tkv


    // Return the matchIndex branch for `kv`, creating an empty one if needed
    function getOrCreateBranch(kv) {
      let branch = _matchIndex.get(kv);
      if (!branch) {
        branch = {
          primary: new Map(),
          alternate: new Map(),
          excludeGeneric: new Map(),
          excludeNamed: new Map()
        };
        _matchIndex.set(kv, branch);
      }
      return branch;
    }


    // Insert this item into the matchIndex
    // `which` is 'primary' or 'alternate'. Nothing is inserted when `nsimple` is empty.
    function insertName(which, kv, nsimple, itemID) {
      if (!nsimple) return;

      const names = getOrCreateBranch(kv)[which];
      let leaf = names.get(nsimple);
      if (!leaf) {
        leaf = new Set();
        names.set(nsimple, leaf);
      }

      leaf.add(itemID);   // insert
    }


    // For certain categories we do not want to match generic KV pairs like `building/yes` or `amenity/yes`
    function skipGenericKVMatches(t, k, v) {
      return (
        t === 'flags' ||
        t === 'transit' ||
        k === 'landuse' ||
        v === 'atm' ||
        v === 'bicycle_parking' ||
        v === 'car_sharing' ||
        v === 'caravan_site' ||
        v === 'charging_station' ||
        v === 'dog_park' ||
        v === 'parking' ||
        v === 'phone' ||
        v === 'playground' ||
        v === 'post_box' ||
        v === 'public_bookcase' ||
        v === 'recycling' ||
        v === 'vending_machine'
      );
    }
  };


  //
  // `buildLocationIndex()`
  // Call this to prepare a which-polygon location index.
  // This *resolves* all the locationSets into GeoJSON, which takes some time.
  // You can skip this step if you don't care about matching within a location.
  //
  // `data` needs to be an Object indexed on a 'tree/key/value' path.
  // (e.g. cache filled by `fileTree.read` or data found in `dist/nsi.json`)
  // {
  //    'brands/amenity/bank': { properties: {}, items: [ {}, {}, … ] },
  //    'brands/amenity/bar':  { properties: {}, items: [ {}, {}, … ] },
  //    …
  // }
  //
  // `loco` needs to provide `resolveLocationSet(locationSet)`, returning an object with
  // an `id` (the locationSet id) and a `feature` (GeoJSON). If it throws, a warning is
  // logged and the item is skipped.
  //
  matcher.buildLocationIndex = (data, loco) => {
    if (_locationIndex) return;   // it was built already

    _itemLocation = new Map();
    _locationSets = new Map();

    Object.keys(data).forEach(tkv => {
      const items = data[tkv].items;
      if (!Array.isArray(items) || !items.length) return;

      items.forEach(item => {
        if (_itemLocation.has(item.id)) return;   // we've seen item id already - shouldn't be possible?

        let resolved;
        try {
          resolved = loco.resolveLocationSet(item.locationSet);   // resolve a feature for this locationSet
        } catch (err) {
          console.warn(`buildLocationIndex: ${err.message}`);     // couldn't resolve
        }
        if (!resolved || !resolved.id) return;

        _itemLocation.set(item.id, resolved.id);      // link it to the item
        if (_locationSets.has(resolved.id)) return;   // we've seen this locationSet feature before..

        // First time seeing this locationSet feature, make a copy and add to locationSet cache..
        const feature = _cloneDeep(resolved.feature);
        feature.id = resolved.id;      // Important: always use the locationSet `id` (`+[Q30]`), not the feature `id` (`Q30`)
        feature.properties.id = resolved.id;

        if (!feature.geometry.coordinates.length || !feature.properties.area) {
          console.warn(`buildLocationIndex: locationSet ${resolved.id} for ${item.id} resolves to an empty feature:`);
          console.warn(JSON.stringify(feature));
          return;
        }

        _locationSets.set(resolved.id, feature);
      });
    });

    _locationIndex = whichPolygon({ type: 'FeatureCollection', features: [..._locationSets.values()] });

    // Deep copy by JSON round trip (the features are plain GeoJSON)
    function _cloneDeep(obj) {
      return JSON.parse(JSON.stringify(obj));
    }
  };


  //
  // `match()`
  // Pass parts and return an Array of matches.
  // `k` - key
  // `v` - value
  // `n` - namelike
  // `loc` - optional - [lon,lat] location to search
  //
  // Throws an Error if `buildMatchIndex()` has not been called first.
  //
  // 1. If the [k,v,n] tuple matches a canonical item…
  // Return an Array of match results.
  // Each result will include the area in km² that the item is valid.
  // (`area` is `Infinity` when no location index was built or the item has no known locationSet)
  //
  // Order of results:
  // Primary ordering will be on the "match" column:
  //   "primary" - where the query matches the `name` tag, followed by
  //   "alternate" - where the query matches an alternate name tag (e.g. short_name, brand, operator, etc)
  // Secondary ordering will be on the "area" column:
  //   "area descending" if no location was provided, (worldwide before local)
  //   "area ascending" if location was provided (local before worldwide)
  //
  // [
  //   { match: 'primary',   itemID: String,  area: Number,  kv: String,  nsimple: String },
  //   { match: 'primary',   itemID: String,  area: Number,  kv: String,  nsimple: String },
  //   { match: 'alternate', itemID: String,  area: Number,  kv: String,  nsimple: String },
  //   { match: 'alternate', itemID: String,  area: Number,  kv: String,  nsimple: String },
  //   …
  // ]
  //
  // -or-
  //
  // 2. If the [k,v,n] tuple matches an exclude pattern…
  // Return an Array with a single exclude result, either
  //
  // [ { match: 'excludeGeneric', pattern: String,  kv: String } ]  // "generic" e.g. "Food Court"
  //   or
  // [ { match: 'excludeNamed', pattern: String,  kv: String } ]    // "named", e.g. "Kebabai"
  //
  // (A result coming from the global `genericWords.json` patterns has no `kv`.)
  //
  // About results
  //   "generic" - a generic word that is probably not really a name.
  //     For these, iD should warn the user "Hey don't put 'food court' in the name tag".
  //   "named" - a real name like "Kebabai" that is just common, but not a brand.
  //     For these, iD should just let it be. We don't include these in NSI, but we don't want to nag users about it either.
  //
  // -or-
  //
  // 3. If the [k,v,n] tuple matches nothing of any kind, return `null`
  //
  //
  matcher.match = (k, v, n, loc) => {
    if (!_matchIndex) {
      throw new Error('match: matchIndex not built.');
    }

    // If we were supplied a location, and a _locationIndex has been set up,
    // get the locationSets that are valid there so we can filter results.
    let matchLocations;
    if (Array.isArray(loc) && _locationIndex) {
      // which-polygon query returns an array of GeoJSON properties, pass true to return all results.
      // It returns `null` when nothing contains the point - treat that as "no valid locationSets"
      // so the location filter below still applies (rather than being silently skipped).
      matchLocations = _locationIndex([loc[0], loc[1], loc[0], loc[1]], true) || [];
    }

    const nsimple = simplify(n);

    const seen = new Set();   // itemIDs already in `results`
    const results = [];
    gatherResults('primary');
    gatherResults('alternate');
    if (results.length) return results;
    gatherResults('exclude');

    return results.length ? results : null;


    // Append to `results` whatever matches on the given pass ('primary', 'alternate' or 'exclude').
    // Stops at the first kv that produces a match.
    function gatherResults(which) {
      // First try an exact match on k/v
      const kv = `${k}/${v}`;
      if (tryMatch(which, kv)) return;

      // If that didn't work, look in match groups for other pairs considered equivalent to k/v..
      for (const mg in matchGroups) {
        const matchGroup = matchGroups[mg];
        if (!matchGroup.includes(kv)) continue;   // this kv is not in the group

        for (const otherkv of matchGroup) {
          if (otherkv === kv) continue;  // skip self
          if (tryMatch(which, otherkv)) return;
        }
      }

      // If finished 'exclude' pass and still haven't matched anything, try the global `genericWords.json` patterns
      if (which === 'exclude') {
        const regex = [..._genericWords.values()].find(regex => regex.test(n));
        if (regex) {
          results.push({ match: 'excludeGeneric', pattern: String(regex) });  // note no `branch`, no `kv`
        }
      }
    }


    // Try one kv on the given pass. Pushes to `results` and returns `true` if anything matched.
    function tryMatch(which, kv) {
      const branch = _matchIndex.get(kv);
      if (!branch) return;

      if (which === 'exclude') {  // Test name `n` against named and generic exclude patterns
        let regex = [...branch.excludeNamed.values()].find(regex => regex.test(n));
        if (regex) {
          results.push({ match: 'excludeNamed', pattern: String(regex), kv: kv });
          return true;   // report the match, so the caller stops at a single exclude result
        }
        regex = [...branch.excludeGeneric.values()].find(regex => regex.test(n));
        if (regex) {
          results.push({ match: 'excludeGeneric', pattern: String(regex), kv: kv });
          return true;   // report the match, so the caller stops at a single exclude result
        }
        return;
      }

      const leaf = branch[which].get(nsimple);
      if (!leaf || !leaf.size) return;

      // If we get here, we matched something..
      // Prepare the results, calculate areas (if location index was set up)
      let hits = Array.from(leaf).map(itemID => {
        let area = Infinity;
        if (_itemLocation && _locationSets) {
          const location = _locationSets.get(_itemLocation.get(itemID));
          area = (location && location.properties.area) || Infinity;
        }
        return { match: which, itemID: itemID, area: area, kv: kv, nsimple: nsimple };
      });

      let sortFn = byAreaDescending;

      // Filter the match to include only results valid in the requested `loc`..
      if (matchLocations) {
        hits = hits.filter(isValidLocation);
        sortFn = byAreaAscending;
      }

      if (!hits.length) return;

      // push results
      hits.sort(sortFn).forEach(hit => {
        if (seen.has(hit.itemID)) return;
        seen.add(hit.itemID);
        results.push(hit);
      });

      return true;


      // Is the item's locationSet one of those valid at the requested `loc`?
      function isValidLocation(hit) {
        if (!_itemLocation) return true;
        const locationSetID = _itemLocation.get(hit.itemID);
        return matchLocations.some(props => props.id === locationSetID);
      }
      // Sort smaller (more local) locations first.
      function byAreaAscending(hitA, hitB) {
        return hitA.area - hitB.area;
      }
      // Sort larger (more worldwide) locations first.
      function byAreaDescending(hitA, hitB) {
        return hitB.area - hitA.area;
      }
    }
  };


  //
  // `getWarnings()`
  // Return any warnings discovered when building the index.
  // (currently this does nothing)
  //
  matcher.getWarnings = () => _warnings;


  return matcher;
}