Another example with HTML parsing

This commit is contained in:
Ilya Zverev 2017-02-16 14:19:48 +03:00
parent cd8c9b6e2f
commit a2cdbf3773
2 changed files with 80 additions and 4 deletions

View file

@ -216,7 +216,13 @@ class OsmConflator:
logging.debug('Overpass query: %s', query)
r = requests.get(OVERPASS_SERVER + 'interpreter', {'data': query})
if r.status_code != 200:
raise IOError('Failed to download data from Overpass API: {} {}\nQuery: {}'.format(r.status_code, r.text, query))
logging.error('Failed to download data from Overpass API: %s', r.status_code)
logging.error('Query: %s', query)
logging.error('Error message: %s', r.text)
if 'rate_limited' in r.text:
r = requests.get(OVERPASS_SERVER + 'status')
logging.warning('Seems like you are rate limited. API status:\n%s', r.text)
raise IOError()
for el in r.json()['elements']:
if 'tags' not in el:
continue
@ -299,8 +305,12 @@ class OsmConflator:
changed = False
for k, v in sp.tags.items():
if k not in p.tags or (k in master_tags and p.tags[k] != v):
p.tags[k] = v
changed = True
if v is not None:
p.tags[k] = v
changed = True
elif k in p.tags:
del p.tags[k]
changed = True
if changed:
p.action = 'modify'
# If not, action is None and we're not including this object into the osmChange
@ -309,7 +319,7 @@ class OsmConflator:
if self.ref is not None:
p.tags[self.ref] = sp.id
elif keep or p.is_area():
if not retag:
if retag is None:
retag = {'fixme': DELETED_FIXME}
for k, v in retag.items():
if v is not None:

66
profiles/auchan_moscow.py Normal file
View file

@ -0,0 +1,66 @@
# This profile requires lxml package
# NOTE(review): this file is a conflator "profile" — the module-level names
# below (download_url, source, query, ...) form the configuration interface
# read by the conflator framework that loads this file; do not rename them.
import logging
import re
# A web page with a list of shops in Moscow. You can replace it with one for another city
download_url = 'https://www.auchan.ru/ru/moscow/'
# Value written to the source tag of imported objects
source = 'auchan.ru'
# Not adding a ref:auchan tag, since we don't have good identifiers
no_dataset_id = True
# Using a name query with regular expressions
query = [('shop', 'supermarket'), ('name', '~Ашан|АШАН')]
# We don't parse opening hours yet, but it'd be cool if we did
master_tags = set(('name', 'opening_hours', 'phone'))
# Empty dict so we don't add a fixme tag to unmatched objects
tag_unmatched = {}
# Coordinates are VERY approximate, so increasing max distance to ~1 km
max_distance = 0.01
# For some reason, functions here cannot use variables defined above
# And defining them as "global" moves these from locals() to globals()
# (presumably because the framework exec()s this profile — TODO confirm),
# so the URL is copied into a separate name that dataset() can reach.
download_url_copy = download_url
def dataset(fileobj):
    """Parse the downloaded Auchan shop-list HTML page into source points.

    Args:
        fileobj: a binary file-like object with the page at download_url;
            its contents are decoded as UTF-8.

    Returns:
        A list of SourcePoint(title, lat, lon, tags) objects.
        NOTE(review): SourcePoint is not imported here — presumably it is
        injected into this profile's namespace by the conflator framework
        that executes this file; verify against the loader.

    Shops without a parseable Google Maps coordinate link, or appearing
    before any city-name header block, are silently skipped.
    """
    # We are parsing HTML, and for that we need an lxml package
    from lxml import html
    global download_url_copy
    h = html.fromstring(fileobj.read().decode('utf-8'))
    # The page lists shops grouped under city/name headers inside this holder
    shops = h.find_class('shops-in-the-city-holder')[0]
    # Turn relative hrefs into absolute URLs so 'website' tags are usable
    shops.make_links_absolute(download_url_copy)
    # Alternating sequence: a 'mark-box' header div, then a 'shops-list' ul
    blocks = shops.xpath("//div[@class='mark-box'] | //ul[@class='shops-list']")
    logging.debug('Found %s blocks', len(blocks))
    name = None
    # Extracts "lat+lon" from the trailing q= parameter of a Google Maps link
    RE_GMAPS = re.compile(r'q=(-?[0-9.]+)\+(-?[0-9.]+)$')
    data = []
    for block in blocks:
        if block.get('class') == 'mark-box':
            # A header block: remember the (case-normalized) brand name
            # for all shops listed in the following 'shops-list' block
            name = block.xpath("strong[contains(@class, 'name')]/text()")[0].replace('АШАН', 'Ашан')
            logging.debug('Name: %s', name)
        elif block.get('class') == 'shops-list':
            for li in block:
                # Each of title/website/addr may be absent; fall back to None
                title = li.xpath("strong[@class='title']/a/text()")
                title = title[0].lower() if title else None
                website = li.xpath("strong[@class='title']/a/@href")
                website = website[0] if website else None
                addr = li.xpath("p[1]/text()")
                addr = addr[0].strip() if addr else None
                lat = None
                lon = None
                # Coordinates come only from an embedded Google Maps link
                gmapslink = li.xpath(".//a[contains(@href, 'maps.google')]/@href")
                if gmapslink:
                    m = RE_GMAPS.search(gmapslink[0])
                    if m:
                        lat = float(m.group(1))
                        lon = float(m.group(2))
                logging.debug('Found title: %s, website: %s, address: %s, coords: %s, %s', title, website, addr, lat, lon)
                # Keep only entries with coordinates and a known header name
                if lat is not None and name is not None:
                    tags = {
                        'name': name,
                        'brand': 'Auchan',
                        'shop': 'supermarket',
                        'phone': '8-800-700-5-800',
                        'operator': 'ООО «АШАН»',
                        'addr:full': addr,
                        'website': website
                    }
                    data.append(SourcePoint(title, lat, lon, tags))
    return data