Add a profile for Azbuka Vkusa
parent 2786513ba7
commit 43df2ac79a
2 changed files with 205 additions and 0 deletions
profiles/azbuka.py (Executable file, 66 additions)
@@ -0,0 +1,66 @@
#!/usr/bin/env python3
import conflate
import requests
import logging
import re
from io import BytesIO
from yandex_parser import parse_feed


class Profile:
    source = 'Азбука Вкуса'
    dataset_id = 'av'
    query = [('shop', 'convenience', 'supermarket', 'wine', 'alcohol')]
    master_tags = ('operator', 'shop', 'opening_hours', 'name', 'contact:website', 'contact:phone')
    download_url = 'https://av.ru/yandex/supermarket.xml'

    def matches(osmtags, avtags):
        if 'Энотека' in avtags['name']:
            return osmtags.get('shop') in ('wine', 'alcohol')
        name = osmtags.get('name')
        if osmtags.get('shop') not in ('convenience', 'supermarket'):
            return False
        if not name or re.search(r'AB|АВ|Азбука|Daily', name, re.I):
            return True
        if name.upper() in ('SPAR', 'СПАР') or 'континент' in name.lower():
            return True
        return False

    def dataset(fileobj):
        data = []
        other_urls = [
            None,
            'http://av.ru/yandex/market.xml',
            'http://av.ru/yandex/daily.xml',
            'http://av.ru/yandex/enoteka.xml',
        ]
        for url in other_urls:
            if url:
                r = requests.get(url)
                if r.status_code != 200:
                    logging.error('Could not download source data: %s %s', r.status_code, r.text)
                    return None
                f = BytesIO(r.content)
            else:
                # None means "use the feed already downloaded from download_url".
                f = fileobj
            for c in parse_feed(f):
                name = next(iter(c.name.values()))
                tags = {
                    'name': name,
                    'operator': 'ООО «Городской супермаркет»',
                    'contact:phone': '; '.join(c.phones) or None,
                    'contact:website': c.url_add,
                    'opening_hours': c.opening_hours,
                }
                if 'Энотека' in name:
                    tags['shop'] = 'wine'
                elif 'Daily' in name:
                    tags['shop'] = 'convenience'
                else:
                    tags['shop'] = 'supermarket'
                data.append(conflate.SourcePoint(c.id, c.lat, c.lon, tags))
        return data


if __name__ == '__main__':
    conflate.run(Profile)
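A quick way to sanity-check the profile outside the conflator is to call Profile.dataset() directly on the downloaded feed. The sketch below is illustrative only and not part of the commit; it assumes the conflate package is importable, that it is run from the profiles/ directory, and that conflate.SourcePoint exposes id, lat, lon and tags attributes.

# Illustrative sketch, not part of the commit.
import requests
from io import BytesIO
from azbuka import Profile

r = requests.get(Profile.download_url)
r.raise_for_status()
points = Profile.dataset(BytesIO(r.content))  # also fetches the three extra feeds
for pt in (points or [])[:5]:
    print(pt.id, pt.lat, pt.lon, pt.tags.get('shop'), pt.tags.get('name'))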
profiles/yandex_parser.py (Normal file, 139 additions)
@@ -0,0 +1,139 @@
from lxml import etree
import logging
import re
import phonenumbers  # https://pypi.python.org/pypi/phonenumberslite


class Company:
    def __init__(self, cid):
        self.id = cid
        self.name = {}
        self.alt_name = {}
        self.address = {}
        self.country = {}
        self.address_add = {}
        self.opening_hours = None
        self.url = None
        self.url_add = None
        self.url_ext = None
        self.email = None
        self.rubric = []
        self.phones = []
        self.faxes = []
        self.photos = []
        self.lat = None
        self.lon = None
        self.other = {}


def parse_feed(f):
    def multilang(c, name):
        # Note: reads the current `company` element from the enclosing loop below.
        for el in company.findall(name):
            lang = el.get('lang', 'default')
            value = el.text
            if value and len(value.strip()) > 0:
                c[lang] = value.strip()

    def parse_subels(el):
        res = {}
        if el is None:
            return res
        for subel in el:
            name = subel.tag
            text = subel.text
            if text and text.strip():
                res[name] = text
        return res

    def parse_opening_hours(s):
        # 'углосуточн' also matches 'Круглосуточно'/'круглосуточно' regardless of the first letter's case.
        if 'углосуточн' in s:
            return '24/7'
        m = re.search(r'([01]?\d:\d\d).*?([12]?\d:\d\d)', s)
        if m:
            # TODO: parse weekdays
            start = m.group(1)
            start = re.sub(r'^(\d:)', r'0\1', start)
            end = m.group(2)
            end = re.sub(r'0?0:', '24:', end)
            return 'Mo-Su {}-{}'.format(start, end)
        # TODO
        return None

    xml = etree.parse(f).getroot()
    if xml.tag != 'companies':
        logging.error('Root node must be named "companies", not %s', xml.tag)
    for company in xml:
        if company.tag != 'company':
            logging.warning('Non-company in yandex xml: %s', company.tag)
            continue
        cid = company.find('company-id')
        if cid is None or not cid.text:
            logging.error('No id for a company')
            continue
        c = Company(cid.text.strip())
        multilang(c.name, 'name')
        multilang(c.alt_name, 'name-other')
        multilang(c.address, 'address')
        loc = {}
        multilang(loc, 'locality-name')
        if loc:
            # Prepend the locality name to the address in the matching language.
            for lng, place in loc.items():
                if lng in c.address:
                    c.address[lng] = place + ', ' + c.address[lng]
        multilang(c.address_add, 'address-add')
        multilang(c.country, 'country')
        coord = parse_subels(company.find('coordinates'))
        if 'lat' in coord and 'lon' in coord:
            c.lat = float(coord['lat'])
            c.lon = float(coord['lon'])
        else:
            logging.warning('No coordinates for %s', c.id)
            continue
        for ph in company.findall('phone'):
            phone = parse_subels(ph)
            if 'number' not in phone:
                continue
            # Normalize to international format; phonenumbers.parse() raises on malformed input.
            parsed_phone = phonenumbers.parse(phone['number'], 'RU')
            number = phonenumbers.format_number(
                parsed_phone, phonenumbers.PhoneNumberFormat.INTERNATIONAL)
            if 'ext' in phone:
                number += ' ext. ' + phone['ext']
            typ = phone.get('type', 'phone')
            if typ == 'fax':
                c.faxes.append(number)
            else:
                c.phones.append(number)
        email = company.find('email')
        if email is not None and email.text:
            c.email = email.text.strip()
        url = company.find('url')
        if url is not None and url.text:
            c.url = url.text.strip()
        url_add = company.find('add-url')
        if url_add is not None and url_add.text:
            c.url_add = url_add.text.strip()
        url_ext = company.find('info-page')
        if url_ext is not None and url_ext.text:
            c.url_ext = url_ext.text.strip()
        for rub in company.findall('rubric-id'):
            if rub.text:
                c.rubric.append(int(rub.text.strip()))
        coh = company.find('working-time')
        if coh is not None and coh.text:
            c.opening_hours = parse_opening_hours(coh.text)
        photos = company.find('photos')
        if photos is not None:
            for photo in photos:
                if photo.get('type', 'interior') != 'food':
                    c.photos.append(photo.get('url'))
        for feat in company:
            if feat.tag.startswith('feature-'):
                name = feat.get('name', None)
                value = feat.get('value', None)
                if name is not None and value is not None:
                    if feat.tag == 'feature-boolean':
                        value = value == '1'
                    elif '-numeric' in feat.tag:
                        value = float(value)
                    c.other[name] = value
        yield c
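For reference, a minimal sketch of feeding parse_feed() a tiny in-memory document built from the element names the parser reads (company-id, name, coordinates, phone, working-time). It is illustrative only and not part of the commit.

# Illustrative sketch, not part of the commit.
from io import BytesIO
from yandex_parser import parse_feed

sample = '''<?xml version="1.0" encoding="utf-8"?>
<companies>
  <company>
    <company-id>1</company-id>
    <name lang="ru">АВ Daily</name>
    <coordinates><lon>37.62</lon><lat>55.75</lat></coordinates>
    <phone><number>+7 495 123-45-67</number></phone>
    <working-time>ежедневно с 09:00 до 22:00</working-time>
  </company>
</companies>'''.encode('utf-8')

for c in parse_feed(BytesIO(sample)):
    print(c.id, c.name, (c.lat, c.lon), c.phones, c.opening_hours)
# Prints roughly: 1 {'ru': 'АВ Daily'} (55.75, 37.62) ['+7 495 123-45-67'] Mo-Su 09:00-22:00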