#!/usr/bin/env python3
# File: profiles/azbuka.py (added by the patch as a new file, mode 100755)
"""Conflation profile for the Azbuka Vkusa ("Азбука Вкуса") shop feeds."""
import logging
import re
from io import BytesIO

# Seconds to wait for each extra feed; without a timeout a stalled server
# would block the whole run indefinitely.
_DOWNLOAD_TIMEOUT = 30

# Brand markers looked for in an OSM object's name.  The two-letter
# abbreviations (Latin "AB" and Cyrillic "АВ") must stand alone as a word:
# as plain substrings they would also hit unrelated names such as "Лавка".
_BRAND_RE = re.compile(r'\b(?:AB|АВ)\b|Азбука|Daily', re.I)


class Profile:
    """Profile object handed to ``conflate.run``.

    The public attributes are read by conflate; their exact semantics are
    defined there and are not visible from this file.
    """

    source = 'Азбука Вкуса'
    dataset_id = 'av'
    query = [('shop', 'convenience', 'supermarket', 'wine', 'alcohol')]
    master_tags = (
        'operator', 'shop', 'opening_hours', 'name',
        'contact:website', 'contact:phone',
    )
    download_url = 'https://av.ru/yandex/supermarket.xml'

    @staticmethod
    def matches(osmtags, avtags):
        """Tell whether an OSM object may correspond to a feed point.

        :param osmtags: tags of the OSM object (mapping).
        :param avtags: tags of the feed point; must contain ``'name'``.
        :return: ``True`` when the pair is an acceptable match.

        Wine stores ("Энотека") only match ``shop=wine`` / ``shop=alcohol``.
        Every other point matches a ``shop=convenience`` / ``shop=supermarket``
        object that is unnamed, carries a brand marker in its name, or is
        named like one of the accepted other chains (SPAR / "континент").
        """
        shop = osmtags.get('shop')
        if 'Энотека' in avtags['name']:
            return shop in ('wine', 'alcohol')
        if shop not in ('convenience', 'supermarket'):
            return False
        name = osmtags.get('name')
        if not name or _BRAND_RE.search(name):
            return True
        return name.upper() in ('SPAR', 'СПАР') or 'континент' in name.lower()

    @staticmethod
    def dataset(fileobj):
        """Build the list of source points from all shop feeds.

        :param fileobj: readable object with the main feed (presumably the
            content of ``download_url`` supplied by conflate -- confirm).
        :return: list of ``conflate.SourcePoint``, or ``None`` when one of
            the additional feeds could not be downloaded.
        """
        # Third-party / project modules are imported at the point of use so
        # that the profile class itself (and ``matches``) can be imported
        # without the downloader and conflation stack being installed.
        import conflate
        import requests
        from yandex_parser import parse_feed

        data = []
        # ``None`` stands for the feed already supplied through ``fileobj``.
        feed_urls = [
            None,
            'http://av.ru/yandex/market.xml',
            'http://av.ru/yandex/daily.xml',
            'http://av.ru/yandex/enoteka.xml',
        ]
        for url in feed_urls:
            if url:
                try:
                    r = requests.get(url, timeout=_DOWNLOAD_TIMEOUT)
                except requests.RequestException as e:
                    # Treat network failures like a bad HTTP status.
                    logging.error('Could not download source data: %s %s', url, e)
                    return None
                if r.status_code != 200:
                    logging.error('Could not download source data: %s %s',
                                  r.status_code, r.text)
                    return None
                f = BytesIO(r.content)
            else:
                f = fileobj
            for c in parse_feed(f):
                # First value of the name mapping; ``None`` for a nameless
                # company instead of an uncaught StopIteration.
                name = next(iter(c.name.values()), None)
                tags = {
                    'name': name,
                    'operator': 'ООО «Городской супермаркет»',
                    'contact:phone': '; '.join(c.phones) or None,
                    'contact:website': c.url_add,
                    'opening_hours': c.opening_hours,
                }
                label = name or ''
                if 'Энотека' in label:
                    tags['shop'] = 'wine'
                elif 'Daily' in label:
                    tags['shop'] = 'convenience'
                else:
                    tags['shop'] = 'supermarket'
                data.append(conflate.SourcePoint(c.id, c.lat, c.lon, tags))
        return data


if __name__ == '__main__':
    import conflate

    conflate.run(Profile)
# File: profiles/yandex_parser.py (added by the patch as a new file, mode 100644)
"""Parser for company feeds in the Yandex XML format (root ``<companies>``)."""
import logging
import re

# Two clock times written as "H:MM" or "HH:MM".  The look-behinds keep a match
# from starting in the middle of a two-digit hour (e.g. the "0:00" of "20:00").
_HOURS_RE = re.compile(r'(?<!\d)([0-2]?\d:\d\d).*?(?<!\d)([0-2]?\d:\d\d)')


class Company:
    """Plain record with the data extracted from one ``<company>`` element."""

    def __init__(self, cid):
        self.id = cid              # text of <company-id>
        # Language-keyed texts: {lang attribute or 'default': text}.
        self.name = {}             # <name>
        self.alt_name = {}         # <name-other>
        self.address = {}          # <address>, prefixed with <locality-name>
        self.country = {}          # <country>
        self.address_add = {}      # <address-add>
        self.opening_hours = None  # derived from <working-time>, or None
        self.url = None            # <url>
        self.url_add = None        # <add-url>
        self.url_ext = None        # <info-page>
        self.email = None          # <email>
        self.rubric = []           # integers from <rubric-rd>
        self.phones = []           # formatted numbers from <phone>
        self.faxes = []            # formatted numbers of <phone> with type "fax"
        self.photos = []           # urls of <photos> children, "food" excluded
        self.lat = None            # float from <coordinates>/<lat>
        self.lon = None            # float from <coordinates>/<lon>
        self.other = {}            # values of <feature-*> elements, by name


def parse_opening_hours(s):
    """Convert a free-form working-time string to an ``opening_hours`` value.

    :param s: text of a ``<working-time>`` element.
    :return: ``'24/7'`` for round-the-clock places, ``'Mo-Su HH:MM-HH:MM'``
        built from the first two clock times found, or ``None`` when no
        time range is recognised.
    """
    if 'углосуточн' in s:
        return '24/7'
    m = _HOURS_RE.search(s)
    if m is None:
        # TODO
        return None
    # TODO: parse weekdays
    start, end = (t.rjust(5, '0') for t in m.groups())
    # A closing time at midnight is written as 24:xx.  Only the hour field is
    # inspected, so "20:00" is no longer mangled into "224:00".
    if end.startswith('00:'):
        end = '24:' + end[3:]
    return 'Mo-Su {}-{}'.format(start, end)


def _collect_multilang(company, target, tag):
    """Store the stripped text of each ``tag`` child into ``target`` by language."""
    for el in company.findall(tag):
        value = (el.text or '').strip()
        if value:
            target[el.get('lang', 'default')] = value


def _parse_subels(el):
    """Return ``{child tag: stripped text}`` for the non-empty children of ``el``."""
    res = {}
    if el is None:
        return res
    for subel in el:
        text = (subel.text or '').strip()
        # Comments / processing instructions have a non-string tag in lxml.
        if text and isinstance(subel.tag, str):
            res[subel.tag] = text
    return res


def _child_text(company, tag):
    """Return the stripped text of the first ``tag`` child, or ``None``."""
    el = company.find(tag)
    if el is not None and el.text:
        return el.text.strip() or None
    return None


def _parse_phones(company, c):
    """Fill ``c.phones`` / ``c.faxes`` from the ``<phone>`` children."""
    phone_els = company.findall('phone')
    if not phone_els:
        return
    # Imported at the point of use so that the rest of the module works
    # without the dependency when a feed carries no phone numbers.
    import phonenumbers  # https://pypi.python.org/pypi/phonenumberslite

    for ph in phone_els:
        phone = _parse_subels(ph)
        if 'number' not in phone:
            continue
        try:
            parsed_phone = phonenumbers.parse(phone['number'], 'RU')
        except phonenumbers.NumberParseException as e:
            # One malformed number must not abort the whole feed.
            logging.warning('Bad phone number for %s: %s', c.id, e)
            continue
        number = phonenumbers.format_number(
            parsed_phone, phonenumbers.PhoneNumberFormat.INTERNATIONAL)
        if 'ext' in phone:
            number += ' ext. ' + phone['ext']
        if phone.get('type', 'phone') == 'fax':
            c.faxes.append(number)
        else:
            c.phones.append(number)


def parse_company(company):
    """Build a :class:`Company` from one ``<company>`` element.

    :param company: element exposing the ElementTree API (``find``,
        ``findall``, ``get``, ``text``, ``tag``, iteration over children).
    :return: the populated :class:`Company`, or ``None`` when the element
        has no id or no usable coordinates (the reason is logged).
    """
    cid = _child_text(company, 'company-id')
    if not cid:
        logging.error('No id for a company')
        return None
    c = Company(cid)
    _collect_multilang(company, c.name, 'name')
    _collect_multilang(company, c.alt_name, 'name-other')
    _collect_multilang(company, c.address, 'address')
    loc = {}
    _collect_multilang(company, loc, 'locality-name')
    for lng, place in loc.items():
        if lng in c.address:
            # Prefix the address of the same language with the locality.
            c.address[lng] = place + ', ' + c.address[lng]
    _collect_multilang(company, c.address_add, 'address-add')
    _collect_multilang(company, c.country, 'country')

    coord = _parse_subels(company.find('coordinates'))
    if 'lat' not in coord or 'lon' not in coord:
        logging.warning('No coordinates for %s', c.id)
        return None
    try:
        c.lat = float(coord['lat'])
        c.lon = float(coord['lon'])
    except ValueError:
        logging.warning('Bad coordinates for %s: %r', c.id, coord)
        return None

    _parse_phones(company, c)
    c.email = _child_text(company, 'email')
    c.url = _child_text(company, 'url')
    c.url_add = _child_text(company, 'add-url')
    c.url_ext = _child_text(company, 'info-page')
    for rub in company.findall('rubric-rd'):
        text = (rub.text or '').strip()
        if text:
            c.rubric.append(int(text))
    hours = _child_text(company, 'working-time')
    if hours:
        c.opening_hours = parse_opening_hours(hours)
    photos = company.find('photos')
    if photos is not None:
        for photo in photos:
            url = photo.get('url')
            if url and photo.get('type', 'interior') != 'food':
                c.photos.append(url)
    for feat in company:
        tag = feat.tag
        # lxml comment nodes have a callable, not a string, as their tag.
        if not isinstance(tag, str) or not tag.startswith('feature-'):
            continue
        name = feat.get('name')
        value = feat.get('value')
        if name is None or value is None:
            continue
        if tag == 'feature-boolean':
            value = value == '1'
        elif '-numeric' in tag:
            value = float(value)
        c.other[name] = value
    return c


def parse_feed(f):
    """Yield a :class:`Company` for every usable ``<company>`` in a feed.

    :param f: anything ``lxml.etree.parse`` accepts as a source.
    :return: generator of :class:`Company`; elements that are not
        ``<company>``, or that lack an id or coordinates, are logged and
        skipped.
    """
    # Imported here so the pure helpers above stay importable without lxml.
    from lxml import etree

    xml = etree.parse(f).getroot()
    if xml.tag != 'companies':
        logging.error('Root node must be named "companies", not %s', xml.tag)
    for company in xml:
        if company.tag != 'company':
            logging.warning('Non-company in yandex xml: %s', company.tag)
            continue
        c = parse_company(company)
        if c is not None:
            yield c