From fdc4b242b0105d54515e35e43344b1d5f4e4a983 Mon Sep 17 00:00:00 2001
From: Ilya Zverev <zverik@textual.ru>
Date: Mon, 15 Jan 2018 19:36:32 +0300
Subject: [PATCH] Tag filter in cpp, better categories matching in conflate

---
 conflate/conflate.py             | 156 +++++++++++++++++++++++--------
 filter/filter_planet_by_cats.cpp |  25 ++++-
 profiles/auchan_moscow.py        |   2 +-
 3 files changed, 141 insertions(+), 42 deletions(-)

diff --git a/conflate/conflate.py b/conflate/conflate.py
index 14ef9fe..1fb7fcf 100755
--- a/conflate/conflate.py
+++ b/conflate/conflate.py
@@ -73,13 +73,14 @@ class OSMPoint(SourcePoint):
     """An OSM points is a SourcePoint with a few extra fields.
     Namely, version, members (for ways and relations), and an action.
     The id is compound and created from object type and object id."""
-    def __init__(self, ptype, pid, version, lat, lon, tags=None):
+    def __init__(self, ptype, pid, version, lat, lon, tags=None, categories=None):
         super().__init__('{}{}'.format(ptype[0], pid), lat, lon, tags)
         self.osm_type = ptype
         self.osm_id = pid
         self.version = version
         self.members = None
         self.action = None
+        self.categories = categories or set()  # None (or empty) becomes a fresh set
 
     def copy(self):
         """Returns a copy of this object, except for members field."""
@@ -207,27 +208,35 @@ class OsmConflator:
         (k, v) turns into [k=v], (k,) into [k], (k, None) into [!k], (k, "~v") into [k~v]."""
         tags = self.profile.get(
             'query', required="a list of tuples. E.g. [('amenity', 'cafe'), ('name', '~Mc.*lds')]")
+        tag_strs = []
         if isinstance(tags, str):
-            tag_str = tags
+            tag_strs = [tags]
         else:
-            tag_str = ''
-            for t in tags:
-                if len(t) == 1:
-                    q = '"{}"'.format(t[0])
-                elif t[1] is None or len(t[1]) == 0:
-                    q = '"!{}"'.format(t[0])
-                elif t[1][0] == '~':
-                    q = '"{}"~"{}"'.format(t[0], t[1][1:])
-                else:
-                    q = '"{}"="{}"'.format(t[0], t[1])
-                tag_str += '[' + q + ']'
+            if isinstance(tags[0][0], str):
+                tags = [tags]
+            for tags_q in tags:
+                tag_str = ''
+                for t in tags_q:
+                    if len(t) == 1:
+                        q = '"{}"'.format(t[0])
+                    elif t[1] is None or len(t[1]) == 0:
+                        q = '"!{}"'.format(t[0])
+                    elif t[1][0] == '~':
+                        q = '"{}"~"{}",i'.format(t[0], t[1][1:])
+                    elif len(t) > 2:
+                        q = '"{}"~"^({})$"'.format(t[0], '|'.join(t[1:]))
+                    else:
+                        q = '"{}"="{}"'.format(t[0], t[1])
+                    tag_str += '[' + q + ']'
+                tag_strs.append(tag_str)
 
         timeout = self.profile.get('overpass_timeout', 120)
         query = '[out:xml]{};('.format('' if timeout is None else '[timeout:{}]'.format(timeout))
         for bbox in bboxes:
             bbox_str = '' if bbox is None else '(' + ','.join([str(x) for x in bbox]) + ')'
-            for t in ('node', 'way', 'relation["type"="multipolygon"]'):
-                query += t + tag_str + bbox_str + ';'
+            for tag_str in tag_strs:
+                for t in ('node', 'way', 'relation["type"="multipolygon"]'):
+                    query += t + tag_str + bbox_str + ';'
         if self.ref is not None:
             for t in ('node', 'way', 'relation'):
                 query += t + '["' + self.ref + '"];'
@@ -347,21 +356,67 @@ class OsmConflator:
         return [get_bbox(b, padding) for b in boxes]
 
     def get_categories(self, tags):
+        """Returns a set of categories matching the tags: '' for the default query
+        (or a truthy "qualifies" result), plus names from the profile "categories"."""
+        def match_value(pattern, value):
+            # "~re" is a regex (with ",i" like in Overpass), "!s" a substring, else equality
+            if pattern.startswith('~'):
+                return re.search(pattern[1:], value, re.IGNORECASE) is not None
+            if pattern.startswith('!'):
+                return pattern[1:].lower() in value.lower()
+            return pattern == value
+
+        def match_query(tags, query):
+            """Checks that the tags satisfy every tuple of the query list."""
+            for tag in query:
+                if len(tag) == 1:
+                    # (k,) means [k]: the key must be present
+                    if tag[0] not in tags:
+                        return False
+                elif tag[1] is None or tag[1] == '':
+                    # (k, None) means [!k]: the key must be absent
+                    if tag[0] in tags:
+                        return False
+                else:
+                    value = tags.get(tag[0], None)
+                    if value is None:
+                        return False
+                    if not any(match_value(t2, value) for t2 in tag[1:]):
+                        return False
+            return True
+
+        def tags_to_query(tags):
+            # A category defined only by "query" has no "tags", hence the None check
+            return list((tags or {}).items())
+
+        result = set()
         qualifies = self.profile.get('qualifies', args=tags)
         if qualifies is not None:
-            return qualifies
+            if qualifies:
+                result.add('')
+            return result
 
-        categories = ['']
+        # First check the default query
         query = self.profile.get('query', None)
-        if query is not None and not isinstance(query, str):
-            for tag in query:
-                if len(tag) >= 1:
-                    if tag[0] not in tags:
-                        return False
-                    if len(tag) >= 2 and tag[1][0] != '~':
-                        if tag[1] != tags[tag[0]]:
-                            return False
-        return categories
+        if query is not None:
+            if isinstance(query, str):
+                result.add('')
+            else:
+                # A flat (or empty) list of tuples is a single query
+                if not query or isinstance(query[0][0], str):
+                    query = [query]
+                if any(match_query(tags, q) for q in query):
+                    result.add('')
+
+        # Then check each category if we got these
+        categories = self.profile.get('categories', {})
+        for name, params in categories.items():
+            if 'tags' not in params and 'query' not in params:
+                raise ValueError('No tags and query attributes for category "{}"'.format(name))
+            if match_query(tags, params.get('query', tags_to_query(params.get('tags')))):
+                result.add(name)
+
+        return result
 
     def download_osm(self):
         """Constructs an Overpass API query and requests objects
@@ -457,7 +512,7 @@ class OsmConflator:
                 continue
             pt = OSMPoint(
                 el.tag, int(el.get('id')), int(el.get('version')),
-                coord[0], coord[1], tags)
+                coord[0], coord[1], tags, categories)
             pt.members = members
             if pt.is_poi():
                 if callable(weight_fn):
@@ -846,6 +901,20 @@ def read_dataset(profile, fileobj):
         required='returns a list of SourcePoints with the dataset')
 
 
+def add_categories_to_dataset(profile, dataset):
+    """Moves the value of the profile's "category_tag" tag of each dataset point
+    into its category attribute, then adds the "tags" of that category, if any."""
+    cat_tag = profile.get('category_tag')
+    categories = profile.get('categories')
+    if cat_tag and categories:
+        for point in dataset:
+            if cat_tag not in point.tags:
+                continue
+            point.category = point.tags.pop(cat_tag)
+            extra = categories.get(point.category, {}).get('tags', None)
+            point.tags.update(extra or {})
+
+
 def transform_dataset(profile, dataset):
     """Transforms tags in the dataset using the "transform" method in the profile
     or the instructions in that field in string or dict form."""
@@ -928,7 +997,7 @@ def write_for_filter(profile, dataset, f):
                 if len(part) == 1:
                     parts.append(part[0])
                 elif part[1] is None or len(part[1]) == 0:
-                    parts.append('!'+part[0])
+                    parts.append('{}='.format(part[0]))
                 elif part[1][0] == '~':
                     raise ValueError('Cannot use regular expressions in filter')
                 elif '|' in part[1] or ';' in part[1]:
@@ -938,19 +1007,29 @@ def write_for_filter(profile, dataset, f):
             result.append('|'.join(parts))
         return result
 
+    def tags_to_query(tags):  # tags is None for a category that has only "query"
+        return list((tags or {}).items())
+
     categories = profile.get('categories', {})
-    if None not in categories:
-        categories[None] = profile.get('query')
+    p_query = profile.get('query', None)
+    if p_query is not None:
+        categories[None] = {'query': p_query}
     cat_map = {}
     i = 0
-    for name, query in categories.items():
-        for tags in query_to_tag_strings(query):
-            f.write('{},{},{}\n'.format(i, name or '', tags))
-        cat_map[name] = i
-        i += 1
+    try:
+        for name, query in categories.items():
+            for tags in query_to_tag_strings(query.get('query', tags_to_query(query.get('tags')))):
+                f.write('{},{},{}\n'.format(i, name or '', tags))
+            cat_map[name] = i
+            i += 1
+    except ValueError as e:
+        logging.error(e)
+        return False
     f.write('\n')
     for d in dataset:
-        f.write('{},{},{}\n'.format(d.lon, d.lat, cat_map[d.category]))
+        if d.category in cat_map:
+            f.write('{},{},{}\n'.format(d.lon, d.lat, cat_map[d.category]))
+    return True
 
 
 def run(profile=None):
@@ -995,11 +1074,12 @@ def run(profile=None):
         logging.error('Empty source dataset')
         sys.exit(2)
     transform_dataset(profile, dataset)
+    add_categories_to_dataset(profile, dataset)
     logging.info('Read %s items from the dataset', len(dataset))
 
     if options.for_filter:
-        write_for_filter(profile, dataset, options.for_filter)
-        logging.info('Prepared data for filtering, exitting')
+        if write_for_filter(profile, dataset, options.for_filter):
+            logging.info('Prepared data for filtering, exitting')
         return
 
     audit = None
diff --git a/filter/filter_planet_by_cats.cpp b/filter/filter_planet_by_cats.cpp
index 9f21468..8ed9304 100644
--- a/filter/filter_planet_by_cats.cpp
+++ b/filter/filter_planet_by_cats.cpp
@@ -45,7 +45,7 @@ bool AppendToVector(uint16_t cat_id, void *vec) {
 
 class AmenityHandler : public osmium::handler::Handler {
 
-  constexpr static double kSearchRadius = 0.0001; // ~1 km TODO! revert to 0.01
+  constexpr static double kSearchRadius = 0.01;
 
   typedef RTree<uint16_t, int32_t, 2, double> DatasetTree;
   typedef std::vector<std::vector<std::string>> TQuery;
@@ -78,8 +78,27 @@ class AmenityHandler : public osmium::handler::Handler {
   }
 
   bool TestTags(osmium::TagList const & tags, TQuery const & query) {
-    for (auto const & pair : query) {
-      // TODO
+    // Every condition of the query must hold for the object to pass.
+    // A condition is a key followed by zero or more accepted values;
+    // a single empty value means that the key must be absent.
+    for (std::vector<std::string> const & cond : query) {
+      char const * const actual = tags[cond[0].c_str()];
+      bool const mustBeAbsent = cond.size() == 2 && cond[1].empty();
+      if (mustBeAbsent) {
+        if (actual != nullptr)
+          return false;
+        continue;
+      }
+      if (actual == nullptr)
+        return false;
+      // TODO: substrings?
+      // A bare key has no values to compare: its presence is enough.
+      bool matched = cond.size() <= 1;
+      // Exact, case-sensitive comparison with each accepted value.
+      for (size_t i = 1; !matched && i < cond.size(); ++i)
+        matched = strcmp(actual, cond[i].c_str()) == 0;
+      if (!matched)
+        return false;
     }
     return true;
   }
diff --git a/profiles/auchan_moscow.py b/profiles/auchan_moscow.py
index a6e8625..5e720cf 100644
--- a/profiles/auchan_moscow.py
+++ b/profiles/auchan_moscow.py
@@ -8,7 +8,7 @@ source = 'auchan.ru'
 # Not adding a ref:auchan tag, since we don't have good identifiers
 no_dataset_id = True
 # Using a name query with regular expressions
-query = [('shop', '~supermarket|mall'), ('name', '~Ашан|АШАН')]
+query = [('shop', 'supermarket', 'mall'), ('name', '~Ашан|АШАН')]
 master_tags = ('name', 'opening_hours', 'phone', 'website')
 # Empty dict so we don't add a fixme tag to unmatched objects
 tag_unmatched = {}