# A web page with a list of shops in Moscow. You can replace it with one for another city
download_url = 'https://www.auchan.ru/ru/moscow/'
source = 'auchan.ru'
# Not adding a ref:auchan tag, since we don't have good identifiers
no_dataset_id = True
# Using a name query with regular expressions
query = [('shop', 'supermarket', 'mall'), ('name', '~Ашан|АШАН')]
master_tags = ('name', 'opening_hours', 'phone', 'website')
# Empty dict so we don't add a fixme tag to unmatched objects
tag_unmatched = {}
# Coordinates are VERY approximate, so increasing max distance to 1 km
max_distance = 1000
# For some reason, functions here cannot use variables defined above
# And defining them as "global" moves these from locals() to globals()
download_url_copy = download_url


def dataset(fileobj):
    """Parse the downloaded Auchan shop-list HTML page into source points.

    :param fileobj: binary file-like object with the page contents (UTF-8 HTML).
    :returns: list of ``SourcePoint(title, lat, lon, tags)`` — one per shop
        that has both coordinates (a Google Maps link) and a brand-name
        heading preceding it on the page; others are skipped.
    """
    # BUGFIX: a `global` declaration must precede any binding of the listed
    # names in this scope. The original placed `global download_url_copy, re`
    # AFTER `import re`, which is a SyntaxError in Python 3 ("name 're' is
    # assigned to before global declaration"). Declaring it here first keeps
    # the original intent: `import re` below binds the module at globals(),
    # where the nested parse_weekdays() can also see it.
    global download_url_copy, re

    def parse_weekdays(s):
        """Convert a Russian weekday spec ("пн-пт", "сб, вс", "ежедневно")
        into an opening_hours day prefix: '' for every day, None on failure."""
        weekdays = {k: v for k, v in map(
            lambda x: x.split(),
            'пн Mo,вт Tu,ср We,чт Th,пт Fr,сб Sa,вс Su'.split(','))}
        # Normalize: strip spaces, lower-case, and replace the Latin letter
        # "c" with the identical-looking Cyrillic "с" (a common page typo)
        s = s.replace(' ', '').lower().replace('c', 'с')
        if s == 'ежедневно' or s == 'пн-вс':
            return ''
        parts = []
        for x in s.split(','):
            p = None
            if x in weekdays:
                p = weekdays[x]
            elif '-' in x:
                # A two-letter day range like "пн-пт"
                m = re.match(r'(\w\w)-(\w\w)', x)
                if m:
                    pts = [weekdays.get(m.group(i), None) for i in (1, 2)]
                    if pts[0] and pts[1]:
                        p = '-'.join(pts)
            if p:
                parts.append(p)
            else:
                logging.warning('Could not parse opening hours: %s', s)
                return None
        return ','.join(parts)

    # We are parsing HTML, and for that we need an lxml package
    from lxml import html
    import logging
    import re

    h = html.fromstring(fileobj.read().decode('utf-8'))
    shops = h.find_class('shops-in-the-city-holder')[0]
    shops.make_links_absolute(download_url_copy)
    # The page alternates brand headings ("mark-box") with shop lists
    blocks = shops.xpath("//div[@class='mark-box'] | //ul[@class='shops-list']")
    name = None
    # Coordinates are embedded in a Google Maps link as ...q=<lat>+<lon>
    RE_GMAPS = re.compile(r'q=(-?[0-9.]+)\+(-?[0-9.]+)$')
    # Opening hours like "Ежедневно с 9:00 до 22:00" or "Пн-Пт: 8.30-21.00";
    # [сc] variants again cover the Latin/Cyrillic look-alike letters
    RE_OH = re.compile(
        r'(Ежедневно|(?:(?:Пн|Вт|Ср|Чт|Пт|Сб|В[сc])[, -]*)+)'
        r'[ сc:]+(\d\d?[:.]\d\d)[- до]+(\d\d[.:]\d\d)', re.I)
    data = []
    for block in blocks:
        if block.get('class') == 'mark-box':
            # A heading sets the brand name used for the shops that follow
            name = block.xpath(
                "strong[contains(@class, 'name')]/text()")[0].replace('АШАН', 'Ашан')
            logging.debug('Name: %s', name)
        elif block.get('class') == 'shops-list':
            for li in block:
                title = li.xpath("strong[@class='title']/a/text()")
                title = title[0].lower() if title else None
                website = li.xpath("strong[@class='title']/a/@href")
                website = website[0] if website else None
                addr = li.xpath("p[1]/text()")
                addr = addr[0].strip() if addr else None
                lat = None
                lon = None
                gmapslink = li.xpath(".//a[contains(@href, 'maps.google')]/@href")
                if gmapslink:
                    m = RE_GMAPS.search(gmapslink[0])
                    if m:
                        lat = float(m.group(1))
                        lon = float(m.group(2))
                opening_hours = []
                # Extract opening hours from all the <p> text of the entry
                oh = ' '.join(li.xpath("p/text()"))
                for m in RE_OH.finditer(oh):
                    weekdays = parse_weekdays(m.group(1))
                    if weekdays is not None:
                        # {:0>5s} zero-pads times to HH:MM ("9:00" -> "09:00")
                        opening_hours.append('{}{:0>5s}-{:0>5s}'.format(
                            weekdays + ' ' if weekdays else '',
                            m.group(2).replace('.', ':'),
                            m.group(3).replace('.', ':')))
                logging.debug('Found title: %s, website: %s, opens: %s, coords: %s, %s',
                              title, website, '; '.join(opening_hours) or None,
                              lat, lon)
                # Skip entries without coordinates or a preceding brand heading
                if lat is not None and name is not None:
                    tags = {
                        'name': name,
                        'brand': 'Auchan',
                        'shop': 'supermarket',
                        'phone': '8-800-700-5-800',
                        'operator': 'ООО «АШАН»',
                        'opening_hours': '; '.join(opening_hours),
                        'addr:full': addr,
                        'website': website
                    }
                    data.append(SourcePoint(title, lat, lon, tags))
    return data