783 lines
29 KiB
JavaScript
783 lines
29 KiB
JavaScript
// External
|
||
import chalk from 'chalk';
|
||
import fs from 'node:fs';
|
||
import JSON5 from 'json5';
|
||
import localeCompare from 'locale-compare';
|
||
import LocationConflation from '@rapideditor/location-conflation';
|
||
import safeRegex from 'safe-regex';
|
||
import stringify from '@aitodotai/json-stringify-pretty-compact';
|
||
const withLocale = localeCompare('en-US');
|
||
|
||
// Internal
|
||
import { fileTree } from '../lib/file_tree.js';
|
||
import { idgen } from '../lib/idgen.js';
|
||
import { Matcher } from '../lib/matcher.js';
|
||
import { simplify } from '../lib/simplify.js';
|
||
import { sortObject } from '../lib/sort_object.js';
|
||
import { stemmer } from '../lib/stemmer.js';
|
||
import { validate } from '../lib/validate.js';
|
||
import { writeFileWithMeta } from '../lib/write_file_with_meta.js';
|
||
const matcher = new Matcher();
|
||
|
||
// JSON
|
||
import treesJSON from '../config/trees.json' assert {type: 'json'};
|
||
const trees = treesJSON.trees;
|
||
|
||
// We use LocationConflation for validating and processing the locationSets
|
||
import featureCollectionJSON from '../dist/featureCollection.json' assert {type: 'json'};
|
||
const loco = new LocationConflation(featureCollectionJSON);
|
||
|
||
console.log(chalk.blue('-'.repeat(70)));
|
||
console.log(chalk.blue('🗂 Build index'));
|
||
console.log(chalk.blue('-'.repeat(70)));
|
||
|
||
let _config = {};
|
||
loadConfig();
|
||
|
||
let _cache = {};
|
||
loadIndex();
|
||
|
||
checkItems('brands');
|
||
checkItems('flags');
|
||
checkItems('operators');
|
||
checkItems('transit');
|
||
|
||
let _currCollectionDate = 0;
|
||
let _collected = {};
|
||
let _discard = {};
|
||
let _keep = {};
|
||
loadCollected();
|
||
filterCollected();
|
||
|
||
mergeItems();
|
||
|
||
saveIndex();
|
||
console.log('');
|
||
|
||
|
||
|
||
//
|
||
// Load, validate, cleanup config files
|
||
//
|
||
function loadConfig() {
|
||
['trees', 'replacements', 'genericWords'].forEach(which => {
|
||
const schema = JSON5.parse(fs.readFileSync(`./schema/${which}.json`, 'utf8'));
|
||
const file = `config/${which}.json`;
|
||
const contents = fs.readFileSync(file, 'utf8');
|
||
let data;
|
||
try {
|
||
data = JSON5.parse(contents);
|
||
} catch (jsonParseError) {
|
||
console.error(chalk.red(`Error - ${jsonParseError.message} reading:`));
|
||
console.error(' ' + chalk.yellow(file));
|
||
process.exit(1);
|
||
}
|
||
|
||
// check JSON schema
|
||
validate(file, data, schema);
|
||
|
||
// check regexes
|
||
if (which === 'trees') {
|
||
Object.values(data.trees).forEach(tree => {
|
||
checkRegex(file, tree.nameTags.primary);
|
||
checkRegex(file, tree.nameTags.alternate);
|
||
});
|
||
|
||
} else if (which === 'genericWords') {
|
||
Object.values(data.genericWords).forEach(pattern => checkRegex(file, pattern));
|
||
}
|
||
|
||
// Clean and sort the files for consistency, save them that way.
|
||
if (which === 'trees') {
|
||
Object.keys(data.trees).forEach(t => {
|
||
let tree = data.trees[t];
|
||
let cleaned = {
|
||
emoji: tree.emoji,
|
||
mainTag: tree.mainTag,
|
||
sourceTag: tree.sourceTag,
|
||
nameTags: {
|
||
primary: tree.nameTags.primary,
|
||
alternate: tree.nameTags.alternate,
|
||
}
|
||
};
|
||
tree = cleaned;
|
||
});
|
||
data.trees = sortObject(data.trees);
|
||
|
||
} else if (which === 'replacements') {
|
||
Object.keys(data.replacements).forEach(qid => {
|
||
let replacement = data.replacements[qid];
|
||
let cleaned = {
|
||
note: replacement.note,
|
||
wikidata: replacement.wikidata
|
||
};
|
||
replacement = cleaned;
|
||
});
|
||
data.replacements = sortObject(data.replacements);
|
||
|
||
} else if (which === 'genericWords') {
|
||
data.genericWords = data.genericWords.map(s => s.toLowerCase()).sort(withLocale);
|
||
}
|
||
|
||
// Lowercase and sort the files for consistency, save them that way.
|
||
fs.writeFileSync(file, stringify(data) + '\n');
|
||
|
||
_config[which] = data[which];
|
||
});
|
||
|
||
}
|
||
|
||
|
||
// check for potentially unsafe regular expressions:
|
||
// https://stackoverflow.com/a/43872595
|
||
function checkRegex(fileName, pattern) {
|
||
if (!safeRegex(pattern)) {
|
||
console.error(chalk.red('\nError - Potentially unsafe regular expression:'));
|
||
console.error(' ' + chalk.yellow(fileName + ': ' + pattern));
|
||
process.exit(1);
|
||
}
|
||
}
|
||
|
||
//
|
||
// Load the version number and the lists of tags collected from:
|
||
// https://github.com/ideditor/nsi-collector
|
||
//
|
||
function loadCollected() {
|
||
try {
|
||
const file = `./node_modules/@ideditor/nsi-collector/package.json`;
|
||
const contents = fs.readFileSync(file, 'utf8');
|
||
const collectorJSON = JSON5.parse(contents);
|
||
const rawVersion = collectorJSON.version;
|
||
const matched = rawVersion.match(/[~^]?\d+\.\d+\.(\d+)/);
|
||
if (matched) {
|
||
_currCollectionDate = +matched[1];
|
||
}
|
||
} catch (err) {
|
||
console.error(chalk.yellow(`Warning - ${err.message} reading 'nsi-collector/package.json'`));
|
||
}
|
||
|
||
['name', 'brand', 'operator', 'network'].forEach(tag => {
|
||
const file = `./node_modules/@ideditor/nsi-collector/dist/osm/${tag}s_all.json`;
|
||
const contents = fs.readFileSync(file, 'utf8');
|
||
let data;
|
||
try {
|
||
data = JSON5.parse(contents);
|
||
} catch (jsonParseError) {
|
||
console.error(chalk.red(`Error - ${jsonParseError.message} reading:`));
|
||
console.error(' ' + chalk.yellow(file));
|
||
process.exit(1);
|
||
}
|
||
|
||
_collected[tag] = data;
|
||
});
|
||
}
|
||
|
||
|
||
//
|
||
// Filter the tags collected into _keep and _discard lists
|
||
//
|
||
function filterCollected() {
|
||
const START = '🏗 ' + chalk.yellow(`Filtering values collected from OSM...`);
|
||
const END = '👍 ' + chalk.green(`done filtering`);
|
||
console.log('');
|
||
console.log(START);
|
||
console.time(END);
|
||
let shownSparkle = false;
|
||
|
||
// Before starting, cache genericWords regexes.
|
||
let genericRegex = _config.genericWords.map(s => new RegExp(s, 'i'));
|
||
genericRegex.push(new RegExp(/;/, 'i')); // also discard values with semicolons
|
||
|
||
|
||
Object.keys(_config.trees).forEach(t => {
|
||
const tree = _config.trees[t];
|
||
if (!Array.isArray(tree.sourceTags) || !tree.sourceTags.length) return;
|
||
|
||
let discard = _discard[t] = {};
|
||
let keep = _keep[t] = {};
|
||
let lastCollectionDate = -1;
|
||
let contents, data;
|
||
|
||
try { // Load existing "keep" file
|
||
contents = fs.readFileSync(`dist/filtered/${t}_keep.json`, 'utf8');
|
||
data = JSON5.parse(contents);
|
||
lastCollectionDate = +(data._meta.collectionDate) || -1;
|
||
keep = _keep[t] = data.keep;
|
||
} catch (err) {
|
||
/* ignore - we can overwrite the keep file */
|
||
}
|
||
|
||
// Exit here if:
|
||
// 1. we have data in `keep`, and..
|
||
// 2. that data is fresh (newer or same as installed nsi-collector dependency) - #5519
|
||
// (comment out this next line to force replace the keep/discard lists)
|
||
if (Object.keys(keep).length && lastCollectionDate >= _currCollectionDate) return;
|
||
|
||
// Continue, do filtering, and replace keep/discard lists..
|
||
if (!shownSparkle) {
|
||
console.log(chalk.yellow(`✨ New nsi-collector version ${_currCollectionDate} (was ${lastCollectionDate}). Updating filter lists:`));
|
||
shownSparkle = true;
|
||
}
|
||
|
||
//
|
||
// STEP 1: All the collected "names" from OSM start out in `discard`
|
||
//
|
||
keep = {};
|
||
tree.sourceTags.forEach(tag => {
|
||
let collected = _collected[tag];
|
||
for (const kvn in collected) {
|
||
discard[kvn] = Math.max((discard[kvn] || 0), collected[kvn]);
|
||
}
|
||
});
|
||
|
||
//
|
||
// STEP 2: Move "names" that aren't excluded from `discard` -> `keep`
|
||
//
|
||
let categoryRegex = {}; // regex cache
|
||
for (const kvn in discard) {
|
||
const [kv, n] = kvn.split('|', 2); // kvn = "key/value|name"
|
||
const tkv = `${t}/${kv}`;
|
||
const file = `./data/${tkv}.json`;
|
||
const category = _cache.path[tkv];
|
||
if (!category) continue; // not a category we track in the index, skip
|
||
|
||
const categoryProps = category.properties || {};
|
||
if (categoryProps.skipCollection) continue; // not a category where we want to collect new names, skip
|
||
|
||
if (!categoryRegex[tkv]) {
|
||
const exclude = categoryProps.exclude || {};
|
||
const excludePatterns = (exclude.generic || []).concat((exclude.named || []));
|
||
categoryRegex[tkv] = excludePatterns.map(s => checkRegex(file, s) || new RegExp(s, 'i'));
|
||
}
|
||
const isExcluded = categoryRegex[tkv].some(re => re.test(n)) || genericRegex.some(re => re.test(n));
|
||
if (!isExcluded) {
|
||
keep[kvn] = discard[kvn];
|
||
delete discard[kvn];
|
||
}
|
||
}
|
||
|
||
const discardCount = Object.keys(discard).length;
|
||
const keepCount = Object.keys(keep).length;
|
||
console.log(`${tree.emoji} ${t}:\t${keepCount} keep, ${discardCount} discard`);
|
||
|
||
let stringified;
|
||
const meta = { collectionDate: _currCollectionDate.toString(10) };
|
||
|
||
stringified = stringify({ discard: sortObject(discard) }) + '\n';
|
||
writeFileWithMeta(`dist/filtered/${t}_discard.json`, stringified, meta);
|
||
|
||
stringified = stringify({ keep: sortObject(keep) }) + '\n';
|
||
writeFileWithMeta(`dist/filtered/${t}_keep.json`, stringified, meta);
|
||
});
|
||
|
||
console.timeEnd(END);
|
||
}
|
||
|
||
|
||
//
|
||
// Load the index files under `data/*`
|
||
//
|
||
function loadIndex() {
|
||
const START = '🏗 ' + chalk.yellow(`Loading index files...`);
|
||
const END = '👍 ' + chalk.green(`done loading`);
|
||
console.log('');
|
||
console.log(START);
|
||
console.time(END);
|
||
|
||
fileTree.read(_cache, loco);
|
||
fileTree.expandTemplates(_cache, loco);
|
||
console.timeEnd(END);
|
||
|
||
const MATCH_INDEX_END = '👍 ' + chalk.green(`built match index`);
|
||
console.time(MATCH_INDEX_END);
|
||
matcher.buildMatchIndex(_cache.path);
|
||
console.timeEnd(MATCH_INDEX_END);
|
||
|
||
let warnMatched = matcher.getWarnings();
|
||
if (warnMatched.length) {
|
||
console.warn(chalk.yellow('\n⚠️ Warning - matchIndex errors:'));
|
||
console.warn(chalk.gray('-').repeat(70));
|
||
console.warn(chalk.gray(' `key/value/name` occurs multiple times in the match index.'));
|
||
console.warn(chalk.gray(' To resolve these, make sure the key/value/name does not appear in multiple trees'));
|
||
console.warn(chalk.gray(' (e.g. `amenity/post_office/ups` should not be both a "brand" and an "operator"'));
|
||
console.warn(chalk.gray('-').repeat(70));
|
||
warnMatched.forEach(w => console.warn(chalk.yellow(w)));
|
||
console.warn('total ' + warnMatched.length);
|
||
}
|
||
|
||
|
||
|
||
// It takes a few seconds to resolve all of the locationSets into GeoJSON and insert into which-polygon
|
||
// We don't need a location index for this script, but it's useful to know.
|
||
const LOCATION_INDEX_END = '👍 ' + chalk.green(`built location index`);
|
||
console.time(LOCATION_INDEX_END);
|
||
matcher.buildLocationIndex(_cache.path, loco);
|
||
console.timeEnd(LOCATION_INDEX_END);
|
||
}
|
||
|
||
|
||
//
|
||
// Save the updated index files under `data/*`
|
||
//
|
||
function saveIndex() {
|
||
const START = '🏗 ' + chalk.yellow(`Saving index files...`);
|
||
const END = '👍 ' + chalk.green(`done saving`);
|
||
console.log('');
|
||
console.log(START);
|
||
console.time(END);
|
||
|
||
fileTree.write(_cache);
|
||
console.timeEnd(END);
|
||
}
|
||
|
||
|
||
//
|
||
// mergeItems()
|
||
// Iterate over the names we are keeping and:
|
||
// - insert anything "new" (i.e. not matched by the matcher).
|
||
// - update all items to have whatever tags they should have.
|
||
//
|
||
function mergeItems() {
|
||
// Any country codes which should be replaced by more standard ones in the locationSets
|
||
const countryReplacements = {
|
||
'uk': 'gb', // Exceptionally reserved, United Kingdom is officially assigned the alpha-2 code GB
|
||
}
|
||
|
||
const START = '🏗 ' + chalk.yellow(`Merging items...`);
|
||
const END = '👍 ' + chalk.green(`done merging`);
|
||
console.log('');
|
||
console.log(START);
|
||
console.time(END);
|
||
|
||
|
||
Object.keys(_config.trees).forEach(t => {
|
||
const tree = _config.trees[t];
|
||
let total = 0;
|
||
let totalNew = 0;
|
||
let newItems = {};
|
||
|
||
//
|
||
// INSERT - Look in `_keep` for new items not yet in the index..
|
||
//
|
||
const keeping = _keep[t] || {};
|
||
|
||
// Find new items, keeping only the most popular spelling..
|
||
Object.keys(keeping).forEach(kvn => {
|
||
const count = keeping[kvn];
|
||
const [kv, n] = kvn.split('|', 2); // kvn = "key/value|name"
|
||
const [k, v] = kv.split('/', 2);
|
||
|
||
const matched = matcher.match(k, v, n);
|
||
if (matched) return; // already in the index (or generic)
|
||
|
||
// Use the simplified name when comparing spelling popularity
|
||
const nsimple = simplify(n);
|
||
if (!nsimple) return; // invalid, or the name contains only punctuation?
|
||
const newid = `${k}/${v}|${nsimple}`;
|
||
const otherNew = newItems[newid];
|
||
|
||
// Seen for the first time, or this name is a more popular spelling
|
||
if (!otherNew || otherNew.count < count) {
|
||
newItems[newid] = { kvn: kvn, count: count };
|
||
}
|
||
});
|
||
|
||
// Add the new items
|
||
Object.values(newItems).forEach(newItem => {
|
||
const [kv, n] = newItem.kvn.split('|', 2); // kvn = "key/value|name"
|
||
const [k, v] = kv.split('/', 2);
|
||
const tkv = `${t}/${k}/${v}`;
|
||
|
||
let item = { tags: {} };
|
||
item.displayName = n;
|
||
item.locationSet = { include: ['001'] }; // the whole world
|
||
item.tags[k] = v; // assign default tag k=v
|
||
|
||
// Perform tree-specific tag defaults here..
|
||
if (t === 'brands') {
|
||
item.tags.brand = n;
|
||
item.tags.name = n;
|
||
|
||
} else if (t === 'operators') {
|
||
item.tags.operator = n;
|
||
|
||
} else if (t === 'transit') {
|
||
item.tags.network = n;
|
||
}
|
||
|
||
// Insert into index..
|
||
if (!_cache.path[tkv]) {
|
||
_cache.path[tkv] = { properties: { path: tkv }, items: [], templates: [] };
|
||
}
|
||
|
||
_cache.path[tkv].items.push(item);
|
||
totalNew++;
|
||
});
|
||
|
||
|
||
//
|
||
// UPDATE - Check all items in the tree for expected tags..
|
||
//
|
||
const paths = Object.keys(_cache.path).filter(tkv => tkv.split('/')[0] === t);
|
||
paths.forEach(tkv => {
|
||
let items = _cache.path[tkv].items;
|
||
if (!Array.isArray(items) || !items.length) return;
|
||
|
||
const [t, k, v] = tkv.split('/', 3); // tkv = "tree/key/value"
|
||
const kv = `${k}/${v}`;
|
||
|
||
items.forEach(item => {
|
||
total++;
|
||
let tags = item.tags;
|
||
let name = ''; // which "name" we use for the locales check below
|
||
|
||
// assign some default companion tags if missing
|
||
if (kv === 'amenity/cafe') {
|
||
if (!tags.takeaway) tags.takeaway = 'yes';
|
||
if (!tags.cuisine) tags.cuisine = 'coffee_shop';
|
||
} else if (kv === 'amenity/fast_food') {
|
||
if (!tags.takeaway) tags.takeaway = 'yes';
|
||
} else if (kv === 'amenity/clinic') {
|
||
if (!tags.healthcare) tags.healthcare = 'clinic';
|
||
} else if (kv === 'amenity/dentist') {
|
||
if (!tags.healthcare) tags.healthcare = 'dentist';
|
||
} else if (kv === 'amenity/doctors') {
|
||
if (!tags.healthcare) tags.healthcare = 'doctor';
|
||
} else if (kv === 'amenity/hospital') {
|
||
if (!tags.healthcare) tags.healthcare = 'hospital';
|
||
} else if (kv === 'amenity/pharmacy') {
|
||
if (!tags.healthcare) tags.healthcare = 'pharmacy';
|
||
}
|
||
|
||
// Perform tree-specific tag cleanups here..
|
||
if (t === 'brands') {
|
||
name = tags.brand || tags.name;
|
||
|
||
} else if (t === 'flags') {
|
||
name = tags['flag:name'];
|
||
|
||
// Sort the flags in the file according to their country of origin
|
||
let country = tags.country || item.locationSet.include[0];
|
||
if (typeof country === 'string' && country.length === 2) {
|
||
const cc = country.toUpperCase();
|
||
const re = new RegExp('^' + cc); // leading country code
|
||
if (!re.test(item.displayName)) {
|
||
item.displayName = cc + ' - ' + item.displayName;
|
||
}
|
||
}
|
||
|
||
} else if (t === 'operators') {
|
||
name = tags.operator || tags.name || tags.brand;
|
||
|
||
// Seed missing operator tags (for a file that we copied over from the 'brand' tree)
|
||
Object.keys(tags).forEach(osmkey => {
|
||
if (/brand/.test(osmkey)) {
|
||
const brandkey = osmkey;
|
||
const operatorkey = brandkey.replace('brand', 'operator'); // `brand`->`operator`, `brand:ru`->`operator:ru`, etc.
|
||
if (!tags[operatorkey]) {
|
||
tags[operatorkey] = tags[brandkey];
|
||
}
|
||
}
|
||
});
|
||
|
||
} else if (t === 'transit') {
|
||
name = tags.network;
|
||
}
|
||
|
||
// If the name can only be reasonably read in one country,
|
||
// assign `locationSet`, and localize tags like `name:xx`
|
||
// https://www.regular-expressions.info/unicode.html
|
||
if (/[\u0590-\u05FF]/.test(name)) { // Hebrew
|
||
// note: old ISO 639-1 lang code for Hebrew was `iw`, now `he`
|
||
if (!item.locationSet) item.locationSet = { include: ['iw'] };
|
||
setLanguageTags(tags, 'he');
|
||
} else if (/[\u0E00-\u0E7F]/.test(name)) { // Thai
|
||
if (!item.locationSet) item.locationSet = { include: ['th'] };
|
||
setLanguageTags(tags, 'th');
|
||
} else if (/[\u1000-\u109F]/.test(name)) { // Myanmar
|
||
if (!item.locationSet) item.locationSet = { include: ['mm'] };
|
||
setLanguageTags(tags, 'my');
|
||
} else if (/[\u1100-\u11FF]/.test(name)) { // Hangul
|
||
if (!item.locationSet) item.locationSet = { include: ['kr'] };
|
||
setLanguageTags(tags, 'ko');
|
||
} else if (/[\u1700-\u171F]/.test(name)) { // Tagalog
|
||
if (!item.locationSet) item.locationSet = { include: ['ph'] };
|
||
setLanguageTags(tags, 'tl');
|
||
} else if (/[\u3040-\u30FF]/.test(name)) { // Hirgana or Katakana
|
||
if (!item.locationSet) item.locationSet = { include: ['jp'] };
|
||
setLanguageTags(tags, 'ja');
|
||
} else if (/[\u3130-\u318F]/.test(name)) { // Hangul
|
||
if (!item.locationSet) item.locationSet = { include: ['kr'] };
|
||
setLanguageTags(tags, 'ko');
|
||
} else if (/[\uA960-\uA97F]/.test(name)) { // Hangul
|
||
if (!item.locationSet) item.locationSet = { include: ['kr'] };
|
||
setLanguageTags(tags, 'ko');
|
||
} else if (/[\uAC00-\uD7AF]/.test(name)) { // Hangul
|
||
if (!item.locationSet) item.locationSet = { include: ['kr'] };
|
||
setLanguageTags(tags, 'ko');
|
||
} else {
|
||
if (!item.locationSet) item.locationSet = { include: ['001'] }; // the whole world
|
||
}
|
||
|
||
// Perform common tag cleanups here..
|
||
Object.keys(tags).forEach(osmkey => {
|
||
// Remove tags we're not including in this index
|
||
// anything ending in `website` or `wikipedia` - #5275, #6481
|
||
if (/(website|wikipedia)$/.test(osmkey)) {
|
||
delete tags[osmkey];
|
||
return;
|
||
}
|
||
|
||
// Perform Wikidata QID replacements
|
||
// anything ending in `wikidata`
|
||
if (/wikidata$/.test(osmkey)) {
|
||
const wd = tags[osmkey];
|
||
const replace = _config.replacements[wd]; // If it matches a QID in the replacement list...
|
||
|
||
if (replace && replace.wikidata !== undefined) { // replace or delete `*:wikidata` tag
|
||
if (replace.wikidata) {
|
||
tags[osmkey] = replace.wikidata;
|
||
} else {
|
||
delete tags[osmkey];
|
||
}
|
||
}
|
||
}
|
||
});
|
||
|
||
// Perform locationSet country code replacements
|
||
Object.keys(countryReplacements).forEach(country => {
|
||
[item.locationSet.include, item.locationSet.exclude].forEach(v => {
|
||
if (v) {
|
||
normalizeCountryCode(v, country);
|
||
}
|
||
});
|
||
});
|
||
|
||
// regenerate id here, in case the locationSet has changed
|
||
const locationID = loco.validateLocationSet(item.locationSet).id;
|
||
item.id = idgen(item, tkv, locationID);
|
||
});
|
||
});
|
||
|
||
console.log(`${tree.emoji} ${t}:\t${total} total, ${totalNew} new`);
|
||
|
||
});
|
||
|
||
console.timeEnd(END);
|
||
|
||
|
||
// Copy main tag value to local tag value, but only if local value not assigned yet
|
||
// re: 6788#issuecomment-1188024213
|
||
function setLanguageTags(tags, code) {
|
||
['name', 'brand', 'operator', 'network'].forEach(k => {
|
||
const v = tags[k];
|
||
const loc_k = `${k}:${code}`; // e.g. `name:ja`
|
||
const loc_v = tags[loc_k];
|
||
if (v && !loc_v) {
|
||
tags[loc_k] = v;
|
||
}
|
||
});
|
||
}
|
||
|
||
function normalizeCountryCode(countries, country) {
|
||
const index = countries.indexOf(country.toLowerCase())
|
||
if (index >= 0) {
|
||
const replace = countryReplacements[country.toLowerCase()];
|
||
if (replace && replace.country !== undefined) {
|
||
countries[index] = replace.country.toLowerCase()
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
|
||
//
|
||
// checkItems()
|
||
// Checks all the items for several kinds of issues
|
||
//
|
||
function checkItems(t) {
|
||
console.log('');
|
||
console.log('🏗 ' + chalk.yellow(`Checking ${t}...`));
|
||
|
||
const tree = _config.trees[t];
|
||
const oddChars = /[\s=!"#%'*{},.\/:?\(\)\[\]@\\$\^*+<>«»~`’\u00a1\u00a7\u00b6\u00b7\u00bf\u037e\u0387\u055a-\u055f\u0589\u05c0\u05c3\u05c6\u05f3\u05f4\u0609\u060a\u060c\u060d\u061b\u061e\u061f\u066a-\u066d\u06d4\u0700-\u070d\u07f7-\u07f9\u0830-\u083e\u085e\u0964\u0965\u0970\u0af0\u0df4\u0e4f\u0e5a\u0e5b\u0f04-\u0f12\u0f14\u0f85\u0fd0-\u0fd4\u0fd9\u0fda\u104a-\u104f\u10fb\u1360-\u1368\u166d\u166e\u16eb-\u16ed\u1735\u1736\u17d4-\u17d6\u17d8-\u17da\u1800-\u1805\u1807-\u180a\u1944\u1945\u1a1e\u1a1f\u1aa0-\u1aa6\u1aa8-\u1aad\u1b5a-\u1b60\u1bfc-\u1bff\u1c3b-\u1c3f\u1c7e\u1c7f\u1cc0-\u1cc7\u1cd3\u200b-\u200f\u2016\u2017\u2020-\u2027\u2030-\u2038\u203b-\u203e\u2041-\u2043\u2047-\u2051\u2053\u2055-\u205e\u2cf9-\u2cfc\u2cfe\u2cff\u2d70\u2e00\u2e01\u2e06-\u2e08\u2e0b\u2e0e-\u2e16\u2e18\u2e19\u2e1b\u2e1e\u2e1f\u2e2a-\u2e2e\u2e30-\u2e39\u3001-\u3003\u303d\u30fb\ua4fe\ua4ff\ua60d-\ua60f\ua673\ua67e\ua6f2-\ua6f7\ua874-\ua877\ua8ce\ua8cf\ua8f8-\ua8fa\ua92e\ua92f\ua95f\ua9c1-\ua9cd\ua9de\ua9df\uaa5c-\uaa5f\uaade\uaadf\uaaf0\uaaf1\uabeb\ufe10-\ufe16\ufe19\ufe30\ufe45\ufe46\ufe49-\ufe4c\ufe50-\ufe52\ufe54-\ufe57\ufe5f-\ufe61\ufe68\ufe6a\ufe6b\ufeff\uff01-\uff03\uff05-\uff07\uff0a\uff0c\uff0e\uff0f\uff1a\uff1b\uff1f\uff20\uff3c\uff61\uff64\uff65]+/g;
|
||
|
||
let warnDuplicate = [];
|
||
let warnFormatWikidata = [];
|
||
let warnMissingTag = [];
|
||
let warnFormatTag = [];
|
||
let seenName = {};
|
||
|
||
let total = 0; // total items
|
||
let totalWd = 0; // total items with wikidata
|
||
|
||
const paths = Object.keys(_cache.path).filter(tkv => tkv.split('/')[0] === t);
|
||
const display = (val) => `${val.displayName} (${val.id})`;
|
||
|
||
paths.forEach(tkv => {
|
||
const items = _cache.path[tkv].items;
|
||
if (!Array.isArray(items) || !items.length) return;
|
||
|
||
const [t, k, v] = tkv.split('/', 3); // tkv = "tree/key/value"
|
||
const kv = `${k}/${v}`;
|
||
|
||
items.forEach(item => {
|
||
const tags = item.tags;
|
||
|
||
total++;
|
||
if (tags[tree.mainTag]) totalWd++;
|
||
|
||
// check tags
|
||
Object.keys(tags).forEach(osmkey => {
|
||
if (/:wikidata$/.test(osmkey)) { // Check '*:wikidata' tags
|
||
const wd = tags[osmkey];
|
||
if (!/^Q\d+$/.test(wd)) {
|
||
warnFormatWikidata.push([display(item), wd]);
|
||
}
|
||
}
|
||
});
|
||
|
||
// Warn on other missing tags
|
||
switch (kv) {
|
||
case 'amenity/clinic':
|
||
case 'amenity/hospital':
|
||
case 'amenity/pharmacy':
|
||
if (!tags.healthcare) { warnMissingTag.push([display(item), 'healthcare']); }
|
||
break;
|
||
case 'amenity/gambling':
|
||
case 'leisure/adult_gaming_centre':
|
||
if (!tags.gambling) { warnMissingTag.push([display(item), 'gambling']); }
|
||
break;
|
||
case 'amenity/fast_food':
|
||
case 'amenity/restaurant':
|
||
if (!tags.cuisine) { warnMissingTag.push([display(item), 'cuisine']); }
|
||
break;
|
||
case 'amenity/training':
|
||
if (!tags.training) { warnMissingTag.push([display(item), 'training']); }
|
||
break;
|
||
case 'amenity/vending_machine':
|
||
if (!tags.vending) { warnMissingTag.push([display(item), 'vending']); }
|
||
break;
|
||
case 'man_made/flagpole':
|
||
if (!tags['flag:type']) { warnMissingTag.push([display(item), 'flag:type']); }
|
||
if (!/^wiphala/.test(item.id)) {
|
||
if (!tags['subject']) { warnMissingTag.push([display(item), 'subject']); }
|
||
if (!tags['subject:wikidata']) { warnMissingTag.push([display(item), 'subject:wikidata']); }
|
||
}
|
||
break;
|
||
case 'shop/beauty':
|
||
if (!tags.beauty) { warnMissingTag.push([display(item), 'beauty']); }
|
||
break;
|
||
}
|
||
|
||
// Warn if OSM tags contain odd punctuation or spacing..
|
||
['beauty', 'cuisine', 'flush:disposal', 'gambling', 'government', 'sport', 'training', 'vending'].forEach(osmkey => {
|
||
const val = tags[osmkey];
|
||
if (val && oddChars.test(val)) {
|
||
warnFormatTag.push([display(item), `${osmkey} = ${val}`]);
|
||
}
|
||
});
|
||
// Warn if a semicolon-delimited multivalue has snuck into the index
|
||
['name', 'brand', 'operator', 'network'].forEach(osmkey => {
|
||
const val = tags[osmkey];
|
||
if (val && /;/.test(val)) {
|
||
warnFormatTag.push([display(item), `${osmkey} = ${val}`]);
|
||
}
|
||
});
|
||
// Warn if user put `wikidata` instead of `brand:wikidata`
|
||
['wikidata'].forEach(osmkey => {
|
||
const val = tags[osmkey];
|
||
if (val) {
|
||
warnFormatTag.push([display(item), `${osmkey} = ${val}`]);
|
||
}
|
||
});
|
||
|
||
|
||
// TODO ?
|
||
// // Warn about "new" (no wikidata) items that may duplicate an "existing" (has wikidata) item.
|
||
// // The criteria for this warning is:
|
||
// // - One of the items has no `brand:wikidata`
|
||
// // - The items have nearly the same name
|
||
// // - The items have the same locationSet (or the one without wikidata is worldwide)
|
||
// const name = tags.name || tags.brand;
|
||
// const stem = stemmer(name) || name;
|
||
// const itemwd = tags[tree.mainTag];
|
||
// const itemls = loco.validateLocationSet(item.locationSet).id;
|
||
|
||
// if (!seenName[stem]) seenName[stem] = new Set();
|
||
// seenName[stem].add(item);
|
||
|
||
// if (seenName[stem].size > 1) {
|
||
// seenName[stem].forEach(other => {
|
||
// if (other.id === item.id) return; // skip self
|
||
// const otherwd = other.tags[tree.mainTag];
|
||
// const otherls = loco.validateLocationSet(other.locationSet).id;
|
||
|
||
// // pick one of the items without a wikidata tag to be the "duplicate"
|
||
// if (!itemwd && (itemls === otherls || itemls === '+[Q2]')) {
|
||
// warnDuplicate.push([display(item), display(other)]);
|
||
// } else if (!otherwd && (otherls === itemls || otherls === '+[Q2]')) {
|
||
// warnDuplicate.push([display(other), display(item)]);
|
||
// }
|
||
// });
|
||
// }
|
||
|
||
});
|
||
});
|
||
|
||
if (warnMissingTag.length) {
|
||
console.warn(chalk.yellow('\n⚠️ Warning - Missing tag:'));
|
||
console.warn(chalk.gray('-').repeat(70));
|
||
console.warn(chalk.gray(' To resolve these, add the missing tag.'));
|
||
console.warn(chalk.gray('-').repeat(70));
|
||
warnMissingTag.forEach(w => console.warn(
|
||
chalk.yellow(' "' + w[0] + '"') + ' -> missing tag? -> ' + chalk.yellow('"' + w[1] + '"')
|
||
));
|
||
console.warn('total ' + warnMissingTag.length);
|
||
}
|
||
|
||
if (warnFormatTag.length) {
|
||
console.warn(chalk.yellow('\n⚠️ Warning - Unusual OpenStreetMap tag:'));
|
||
console.warn(chalk.gray('-').repeat(70));
|
||
console.warn(chalk.gray(' To resolve these, make sure the OpenStreetMap tag is correct.'));
|
||
console.warn(chalk.gray('-').repeat(70));
|
||
warnFormatTag.forEach(w => console.warn(
|
||
chalk.yellow(' "' + w[0] + '"') + ' -> unusual tag? -> ' + chalk.yellow('"' + w[1] + '"')
|
||
));
|
||
console.warn('total ' + warnFormatTag.length);
|
||
}
|
||
|
||
if (warnDuplicate.length) {
|
||
console.warn(chalk.yellow('\n⚠️ Warning - Potential duplicate:'));
|
||
console.warn(chalk.gray('-').repeat(70));
|
||
console.warn(chalk.gray(' If the items are two different businesses,'));
|
||
console.warn(chalk.gray(' make sure they both have accurate locationSets (e.g. "us"/"ca") and wikidata identifiers.'));
|
||
console.warn(chalk.gray(' If the items are duplicates of the same business,'));
|
||
console.warn(chalk.gray(' add `matchTags`/`matchNames` properties to the item that you want to keep, and delete the unwanted item.'));
|
||
console.warn(chalk.gray(' If the duplicate item is a generic word,'));
|
||
console.warn(chalk.gray(' add a filter to config/filter_brands.json and delete the unwanted item.'));
|
||
console.warn(chalk.gray('-').repeat(70));
|
||
warnDuplicate.forEach(w => console.warn(
|
||
chalk.yellow(' "' + w[0] + '"') + ' -> duplicates? -> ' + chalk.yellow('"' + w[1] + '"')
|
||
));
|
||
console.warn('total ' + warnDuplicate.length);
|
||
}
|
||
|
||
if (warnFormatWikidata.length) {
|
||
console.warn(chalk.yellow('\n⚠️ Warning - Incorrect `wikidata` format:'));
|
||
console.warn(chalk.gray('-').repeat(70));
|
||
console.warn(chalk.gray(' To resolve these, make sure "*:wikidata" tag looks like "Q191615".'));
|
||
console.warn(chalk.gray('-').repeat(70));
|
||
warnFormatWikidata.forEach(w => console.warn(
|
||
chalk.yellow(' "' + w[0] + '"') + ' -> "*:wikidata": ' + '"' + w[1] + '"'
|
||
));
|
||
console.warn('total ' + warnFormatWikidata.length);
|
||
}
|
||
|
||
const pctWd = total > 0 ? (totalWd * 100 / total).toFixed(1) : 0;
|
||
|
||
console.log('');
|
||
console.info(chalk.blue.bold(`${tree.emoji} ${t}/* completeness:`));
|
||
console.info(chalk.blue.bold(` ${total} total`));
|
||
console.info(chalk.blue.bold(` ${totalWd} (${pctWd}%) with a '${tree.mainTag}' tag`));
|
||
}
|