Tag filter in cpp, better categories matching in conflate

This commit is contained in:
Ilya Zverev 2018-01-15 19:36:32 +03:00
parent 85ee8ea9f2
commit fdc4b242b0
3 changed files with 141 additions and 42 deletions

View file

@ -73,13 +73,14 @@ class OSMPoint(SourcePoint):
"""An OSM points is a SourcePoint with a few extra fields.
Namely, version, members (for ways and relations), and an action.
The id is compound and created from object type and object id."""
def __init__(self, ptype, pid, version, lat, lon, tags=None):
def __init__(self, ptype, pid, version, lat, lon, tags=None, categories=None):
super().__init__('{}{}'.format(ptype[0], pid), lat, lon, tags)
self.osm_type = ptype
self.osm_id = pid
self.version = version
self.members = None
self.action = None
self.categories = categories or set()
def copy(self):
"""Returns a copy of this object, except for members field."""
@ -207,27 +208,35 @@ class OsmConflator:
(k, v) turns into [k=v], (k,) into [k], (k, None) into [!k], (k, "~v") into [k~v]."""
tags = self.profile.get(
'query', required="a list of tuples. E.g. [('amenity', 'cafe'), ('name', '~Mc.*lds')]")
tag_strs = []
if isinstance(tags, str):
tag_str = tags
tag_strs = [tags]
else:
tag_str = ''
for t in tags:
if len(t) == 1:
q = '"{}"'.format(t[0])
elif t[1] is None or len(t[1]) == 0:
q = '"!{}"'.format(t[0])
elif t[1][0] == '~':
q = '"{}"~"{}"'.format(t[0], t[1][1:])
else:
q = '"{}"="{}"'.format(t[0], t[1])
tag_str += '[' + q + ']'
if isinstance(tags[0][0], str):
tags = [tags]
for tags_q in tags:
tag_str = ''
for t in tags_q:
if len(t) == 1:
q = '"{}"'.format(t[0])
elif t[1] is None or len(t[1]) == 0:
q = '"!{}"'.format(t[0])
elif t[1][0] == '~':
q = '"{}"~"{}",i'.format(t[0], t[1][1:])
elif len(t) > 2:
q = '"{}"~"^({})$"'.format(t[0], '|'.join(t[1:]))
else:
q = '"{}"="{}"'.format(t[0], t[1])
tag_str += '[' + q + ']'
tag_strs.append(tag_str)
timeout = self.profile.get('overpass_timeout', 120)
query = '[out:xml]{};('.format('' if timeout is None else '[timeout:{}]'.format(timeout))
for bbox in bboxes:
bbox_str = '' if bbox is None else '(' + ','.join([str(x) for x in bbox]) + ')'
for t in ('node', 'way', 'relation["type"="multipolygon"]'):
query += t + tag_str + bbox_str + ';'
for tag_str in tag_strs:
for t in ('node', 'way', 'relation["type"="multipolygon"]'):
query += t + tag_str + bbox_str + ';'
if self.ref is not None:
for t in ('node', 'way', 'relation'):
query += t + '["' + self.ref + '"];'
@ -347,21 +356,67 @@ class OsmConflator:
return [get_bbox(b, padding) for b in boxes]
def get_categories(self, tags):
def match_query(tags, query):
for tag in query:
if len(tag) == 1:
if tag[0] in tags:
return False
elif tag[1] is None or tag[1] == '':
if tag[0] not in tags:
return False
else:
value = tags.get(tag[0], None)
if value is None:
return False
found = False
for t2 in tag[1:]:
if t2[0] == '~':
m = re.search(t2[1:], value)
if not m:
return False
elif t2[0] == '!':
if t2[1:].lower() in value.lower():
found = True
elif t2 == value:
found = True
if found:
break
if not found:
return False
return True
def tags_to_query(tags):
return [(k, v) for k, v in tags.items()]
result = set()
qualifies = self.profile.get('qualifies', args=tags)
if qualifies is not None:
return qualifies
if qualifies:
result.add('')
return result
categories = ['']
# First check default query
query = self.profile.get('query', None)
if query is not None and not isinstance(query, str):
for tag in query:
if len(tag) >= 1:
if tag[0] not in tags:
return False
if len(tag) >= 2 and tag[1][0] != '~':
if tag[1] != tags[tag[0]]:
return False
return categories
if query is not None:
if isinstance(query, str):
result.add('')
else:
if isinstance(query[0][0], str):
query = [query]
for q in query:
if match_query(tags, q):
result.add('')
break
# Then check each category if we got these
categories = self.profile.get('categories', {})
for name, params in categories.items():
if 'tags' not in params and 'query' not in params:
raise ValueError('No tags and query attributes for category "{}"'.format(name))
if match_query(tags, params.get('query', tags_to_query(params.get('tags')))):
result.add(name)
return result
def download_osm(self):
"""Constructs an Overpass API query and requests objects
@ -457,7 +512,7 @@ class OsmConflator:
continue
pt = OSMPoint(
el.tag, int(el.get('id')), int(el.get('version')),
coord[0], coord[1], tags)
coord[0], coord[1], tags, categories)
pt.members = members
if pt.is_poi():
if callable(weight_fn):
@ -846,6 +901,20 @@ def read_dataset(profile, fileobj):
required='returns a list of SourcePoints with the dataset')
def add_categories_to_dataset(profile, dataset):
tag = profile.get('category_tag')
categories = profile.get('categories')
if not tag or not categories:
return
for d in dataset:
if tag in d.tags:
d.category = d.tags[tag]
del d.tags[tag]
cat_tags = categories.get(d.category, {}).get('tags', None)
if cat_tags:
d.tags.update(cat_tags)
def transform_dataset(profile, dataset):
"""Transforms tags in the dataset using the "transform" method in the profile
or the instructions in that field in string or dict form."""
@ -928,7 +997,7 @@ def write_for_filter(profile, dataset, f):
if len(part) == 1:
parts.append(part[0])
elif part[1] is None or len(part[1]) == 0:
parts.append('!'+part[0])
parts.append('{}='.format(part[0]))
elif part[1][0] == '~':
raise ValueError('Cannot use regular expressions in filter')
elif '|' in part[1] or ';' in part[1]:
@ -938,19 +1007,29 @@ def write_for_filter(profile, dataset, f):
result.append('|'.join(parts))
return result
def tags_to_query(tags):
return [(k, v) for k, v in tags.items()]
categories = profile.get('categories', {})
if None not in categories:
categories[None] = profile.get('query')
p_query = profile.get('query', None)
if p_query is not None:
categories[None] = {'query': p_query}
cat_map = {}
i = 0
for name, query in categories.items():
for tags in query_to_tag_strings(query):
f.write('{},{},{}\n'.format(i, name or '', tags))
cat_map[name] = i
i += 1
try:
for name, query in categories.items():
for tags in query_to_tag_strings(query.get('query', tags_to_query(query.get('tags')))):
f.write('{},{},{}\n'.format(i, name or '', tags))
cat_map[name] = i
i += 1
except ValueError as e:
logging.error(e)
return False
f.write('\n')
for d in dataset:
f.write('{},{},{}\n'.format(d.lon, d.lat, cat_map[d.category]))
if d.category in cat_map:
f.write('{},{},{}\n'.format(d.lon, d.lat, cat_map[d.category]))
return True
def run(profile=None):
@ -995,11 +1074,12 @@ def run(profile=None):
logging.error('Empty source dataset')
sys.exit(2)
transform_dataset(profile, dataset)
add_categories_to_dataset(profile, dataset)
logging.info('Read %s items from the dataset', len(dataset))
if options.for_filter:
write_for_filter(profile, dataset, options.for_filter)
logging.info('Prepared data for filtering, exitting')
if write_for_filter(profile, dataset, options.for_filter):
logging.info('Prepared data for filtering, exitting')
return
audit = None

View file

@ -45,7 +45,7 @@ bool AppendToVector(uint16_t cat_id, void *vec) {
class AmenityHandler : public osmium::handler::Handler {
constexpr static double kSearchRadius = 0.0001; // ~1 km TODO! revert to 0.01
constexpr static double kSearchRadius = 0.01;
typedef RTree<uint16_t, int32_t, 2, double> DatasetTree;
typedef std::vector<std::vector<std::string>> TQuery;
@ -78,8 +78,27 @@ class AmenityHandler : public osmium::handler::Handler {
}
bool TestTags(osmium::TagList const & tags, TQuery const & query) {
for (auto const & pair : query) {
// TODO
for (std::vector<std::string> const & pair : query) {
const char *value = tags[pair[0].c_str()];
if (pair.size() == 2 && pair[1].empty()) {
if (value != nullptr)
return false;
} else {
if (value == nullptr)
return false;
if (pair.size() > 1) {
// TODO: substrings?
bool found = false;
for (size_t i = 1; i < pair.size(); i++) {
if (!strcmp(value, pair[i].c_str())) {
found = true;
break;
}
}
if (!found)
return false;
}
}
}
return true;
}

View file

@ -8,7 +8,7 @@ source = 'auchan.ru'
# Not adding a ref:auchan tag, since we don't have good identifiers
no_dataset_id = True
# Using a name query with regular expressions
query = [('shop', '~supermarket|mall'), ('name', '~Ашан|АШАН')]
query = [('shop', 'supermarket', 'mall'), ('name', '~Ашан|АШАН')]
master_tags = ('name', 'opening_hours', 'phone', 'website')
# Empty dict so we don't add a fixme tag to unmatched objects
tag_unmatched = {}