Update gitignore and parse opening hours for Auchan

Ilya Zverev 2017-02-17 19:36:03 +03:00
parent 7ffa9222a0
commit 392067cc9e
3 changed files with 42 additions and 7 deletions

.gitignore

@@ -1,3 +1,7 @@
 *.swp
 *.osc
 *.zip
+*.json
+*.gz
+*.csv
+private/

conflate.py

@@ -225,11 +225,11 @@ class OsmConflator:
         r = requests.get(OVERPASS_SERVER + 'interpreter', {'data': query})
         if r.status_code != 200:
             logging.error('Failed to download data from Overpass API: %s', r.status_code)
             logging.error('Query: %s', query)
-            logging.error('Error message: %s', r.text)
+            if 'rate_limited' in r.text:
+                r = requests.get(OVERPASS_SERVER + 'status')
+                logging.warning('Seems like you are rate limited. API status:\n%s', r.text)
+            else:
+                logging.error('Error message: %s', r.text)
             raise IOError()
         for el in r.json()['elements']:
             if 'tags' not in el:
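
Note: the new branch only reports the rate limit before raising. A minimal sketch of how a caller could also back off and retry; the 60-second pause, the single retry, and the server URL are arbitrary assumptions, not project settings:

import time
import requests

OVERPASS_SERVER = 'http://overpass-api.de/api/'  # assumed endpoint

def overpass_get(query, retries=1, backoff=60):
    # Fetch from Overpass; if we hit the rate limiter, pause once and retry.
    for attempt in range(retries + 1):
        r = requests.get(OVERPASS_SERVER + 'interpreter', {'data': query})
        if r.status_code == 200:
            return r
        if 'rate_limited' in r.text and attempt < retries:
            time.sleep(backoff)  # wait for our slot to free up
            continue
        raise IOError('Overpass API returned {}'.format(r.status_code))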

profiles/auchan.py

@@ -8,9 +8,8 @@ source = 'auchan.ru'
 # Not adding a ref:auchan tag, since we don't have good identifiers
 no_dataset_id = True
 # Using a name query with regular expressions
-query = [('shop', 'supermarket'), ('name', '~Ашан|АШАН')]
-# We don't parse opening hours yet, but it'd be cool if we did
-master_tags = ('name', 'opening_hours', 'phone')
+query = [('shop', '~supermarket|mall'), ('name', '~Ашан|АШАН')]
+master_tags = ('name', 'opening_hours', 'phone', 'website')
 # Empty dict so we don't add a fixme tag to unmatched objects
 tag_unmatched = {}
 # Coordinates are VERY approximate, so increasing max distance to ~1 km
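
Note: a leading '~' in a query value requests a regular-expression match instead of an exact one. A hypothetical helper illustrating how such tuples could be rendered as Overpass QL tag filters (the real builder lives in conflate.py and may differ):

def to_overpass_filter(query):
    # Illustration of the '~' convention only, not the actual implementation.
    parts = []
    for key, value in query:
        if value.startswith('~'):
            parts.append('["{}"~"{}"]'.format(key, value[1:]))  # regex match
        else:
            parts.append('["{}"="{}"]'.format(key, value))  # exact match
    return ''.join(parts)

to_overpass_filter([('shop', '~supermarket|mall'), ('name', '~Ашан|АШАН')])
# -> '["shop"~"supermarket|mall"]["name"~"Ашан|АШАН"]'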
@@ -20,6 +19,29 @@ max_distance = 0.01
 # And defining them as "global" moves these from locals() to globals()
 download_url_copy = download_url
 def dataset(fileobj):
+    def parse_weekdays(s):
+        weekdays = {k: v for k, v in map(lambda x: x.split(), 'пн Mo,вт Tu,ср We,чт Th,пт Fr,сб Sa,вс Su'.split(','))}
+        s = s.replace(' ', '').lower().replace('c', 'с')
+        if s == 'ежедневно' or s == 'пн-вс':
+            return ''
+        parts = []
+        for x in s.split(','):
+            p = None
+            if x in weekdays:
+                p = weekdays[x]
+            elif '-' in x:
+                m = re.match(r'(\w\w)-(\w\w)', x)
+                if m:
+                    pts = [weekdays.get(m.group(i), None) for i in (1, 2)]
+                    if pts[0] and pts[1]:
+                        p = '-'.join(pts)
+            if p:
+                parts.append(p)
+            else:
+                logging.warning('Could not parse opening hours: %s', s)
+                return None
+        return ','.join(parts)
     # We are parsing HTML, and for that we need an lxml package
     from lxml import html
     global download_url_copy
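
Note: parse_weekdays maps Russian day abbreviations to the two-letter codes used in OSM opening_hours values. Illustrative calls on made-up inputs (the helper is local to dataset(), so this is explanation only):

parse_weekdays('Пн-Пт')      # -> 'Mo-Fr'
parse_weekdays('сб, вс')     # -> 'Sa,Su'
parse_weekdays('Ежедневно')  # -> '' (daily; no day prefix needed)
parse_weekdays('выходной')   # -> None, after logging a warning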
@@ -27,9 +49,9 @@ def dataset(fileobj):
     shops = h.find_class('shops-in-the-city-holder')[0]
     shops.make_links_absolute(download_url_copy)
     blocks = shops.xpath("//div[@class='mark-box'] | //ul[@class='shops-list']")
     logging.debug('Found %s blocks', len(blocks))
     name = None
     RE_GMAPS = re.compile(r'q=(-?[0-9.]+)\+(-?[0-9.]+)$')
+    RE_OH = re.compile(r'(Ежедневно|(?:(?:Пн|Вт|Ср|Чт|Пт|Сб|В[сc])[, -]*)+)[ сc:]+(\d\d?[:.]\d\d)[- до]+(\d\d[.:]\d\d)', re.I)
     data = []
     for block in blocks:
         if block.get('class') == 'mark-box':
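
Note: RE_OH captures the weekday span and the opening and closing times in one pass, tolerating dots or colons in times and a Latin 'c' in place of the Cyrillic 'с'. An illustrative match on a made-up schedule string:

m = RE_OH.search('Пн-Сб с 8.30 до 22.00')
m.group(1)  # 'Пн-Сб ' (weekday span; spaces are stripped by parse_weekdays)
m.group(2)  # '8.30'  (opening time)
m.group(3)  # '22.00' (closing time)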
@@ -51,7 +73,15 @@ def dataset(fileobj):
             if m:
                 lat = float(m.group(1))
                 lon = float(m.group(2))
-            logging.debug('Found title: %s, website: %s, address: %s, coords: %s, %s', title, website, addr, lat, lon)
+            opening_hours = []
+            # Extract opening hours
+            oh = ' '.join(li.xpath("p/text()"))
+            for m in RE_OH.finditer(oh):
+                weekdays = parse_weekdays(m.group(1))
+                if weekdays is not None:
+                    opening_hours.append('{}{:0>5s}-{:0>5s}'.format(
+                        weekdays + ' ' if weekdays else '', m.group(2).replace('.', ':'), m.group(3).replace('.', ':')))
+            logging.debug('Found title: %s, website: %s, opens: %s, coords: %s, %s', title, website, '; '.join(opening_hours) or None, lat, lon)
             if lat is not None and name is not None:
                 tags = {
                     'name': name,
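
Note: the '{:0>5s}' spec left-pads times with '0' to five characters, so '9:00' normalises to '09:00'. Combined with the optional weekday prefix:

'{}{:0>5s}-{:0>5s}'.format('Mo-Fr ', '9:00', '22:00')  # -> 'Mo-Fr 09:00-22:00'
'{}{:0>5s}-{:0>5s}'.format('', '8:30', '22:00')        # -> '08:30-22:00' (daily)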
@@ -59,6 +89,7 @@ def dataset(fileobj):
                     'shop': 'supermarket',
                     'phone': '8-800-700-5-800',
                     'operator': 'ООО «АШАН»',
+                    'opening_hours': '; '.join(opening_hours),
                     'addr:full': addr,
                     'website': website
                 }
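
Note: with several schedule rules, the joined tag value follows the OSM opening_hours syntax, e.g.:

'; '.join(['Mo-Fr 08:30-22:00', 'Sa-Su 09:00-21:00'])
# -> 'Mo-Fr 08:30-22:00; Sa-Su 09:00-21:00'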