Update gitignore and parse opening hours for Auchan

Ilya Zverev 2017-02-17 19:36:03 +03:00
parent 7ffa9222a0
commit 392067cc9e
3 changed files with 42 additions and 7 deletions

.gitignore

@@ -1,3 +1,7 @@
 *.swp
 *.osc
 *.zip
+*.json
+*.gz
+*.csv
+private/

conflate.py

@@ -225,11 +225,11 @@ class OsmConflator:
         r = requests.get(OVERPASS_SERVER + 'interpreter', {'data': query})
         if r.status_code != 200:
             logging.error('Failed to download data from Overpass API: %s', r.status_code)
             logging.error('Query: %s', query)
-            logging.error('Error message: %s', r.text)
+            if 'rate_limited' in r.text:
+                r = requests.get(OVERPASS_SERVER + 'status')
+                logging.warning('Seems like you are rate limited. API status:\n%s', r.text)
+            else:
+                logging.error('Error message: %s', r.text)
             raise IOError()
         for el in r.json()['elements']:
             if 'tags' not in el:
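
Note: the new branch only reports the rate limit before raising. A minimal sketch of how a caller could also back off and retry; the 60-second pause, the single retry, and the server URL are arbitrary assumptions, not project settings:

import time
import requests

OVERPASS_SERVER = 'http://overpass-api.de/api/'  # assumed endpoint

def overpass_get(query, retries=1, backoff=60):
    # Fetch from Overpass; if we hit the rate limiter, pause once and retry.
    for attempt in range(retries + 1):
        r = requests.get(OVERPASS_SERVER + 'interpreter', {'data': query})
        if r.status_code == 200:
            return r
        if 'rate_limited' in r.text and attempt < retries:
            time.sleep(backoff)  # wait for our slot to free up
            continue
        raise IOError('Overpass API returned {}'.format(r.status_code))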

profiles/auchan.py

@@ -8,9 +8,8 @@ source = 'auchan.ru'
 # Not adding a ref:auchan tag, since we don't have good identifiers
 no_dataset_id = True
 # Using a name query with regular expressions
-query = [('shop', 'supermarket'), ('name', '~Ашан|АШАН')]
-# We don't parse opening hours yet, but it'd be cool if we did
-master_tags = ('name', 'opening_hours', 'phone')
+query = [('shop', '~supermarket|mall'), ('name', '~Ашан|АШАН')]
+master_tags = ('name', 'opening_hours', 'phone', 'website')
 # Empty dict so we don't add a fixme tag to unmatched objects
 tag_unmatched = {}
 # Coordinates are VERY approximate, so increasing max distance to ~1 km
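
Note: a leading '~' in a query value requests a regular-expression match instead of an exact one. A hypothetical helper illustrating how such tuples could be rendered as Overpass QL tag filters (the real builder lives in conflate.py and may differ):

def to_overpass_filter(query):
    # Illustration of the '~' convention only, not the actual implementation.
    parts = []
    for key, value in query:
        if value.startswith('~'):
            parts.append('["{}"~"{}"]'.format(key, value[1:]))  # regex match
        else:
            parts.append('["{}"="{}"]'.format(key, value))  # exact match
    return ''.join(parts)

to_overpass_filter([('shop', '~supermarket|mall'), ('name', '~Ашан|АШАН')])
# -> '["shop"~"supermarket|mall"]["name"~"Ашан|АШАН"]'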
@@ -20,6 +19,29 @@ max_distance = 0.01
 # And defining them as "global" moves these from locals() to globals()
 download_url_copy = download_url
 def dataset(fileobj):
+    def parse_weekdays(s):
+        weekdays = {k: v for k, v in map(lambda x: x.split(), 'пн Mo,вт Tu,ср We,чт Th,пт Fr,сб Sa,вс Su'.split(','))}
+        s = s.replace(' ', '').lower().replace('c', 'с')
+        if s == 'ежедневно' or s == 'пн-вс':
+            return ''
+        parts = []
+        for x in s.split(','):
+            p = None
+            if x in weekdays:
+                p = weekdays[x]
+            elif '-' in x:
+                m = re.match(r'(\w\w)-(\w\w)', x)
+                if m:
+                    pts = [weekdays.get(m.group(i), None) for i in (1, 2)]
+                    if pts[0] and pts[1]:
+                        p = '-'.join(pts)
+            if p:
+                parts.append(p)
+            else:
+                logging.warning('Could not parse opening hours: %s', s)
+                return None
+        return ','.join(parts)
     # We are parsing HTML, and for that we need an lxml package
     from lxml import html
     global download_url_copy
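
Note: parse_weekdays maps Russian day abbreviations to the two-letter codes used in OSM opening_hours values. Illustrative calls on made-up inputs (the helper is local to dataset(), so this is explanation only):

parse_weekdays('Пн-Пт')      # -> 'Mo-Fr'
parse_weekdays('сб, вс')     # -> 'Sa,Su'
parse_weekdays('Ежедневно')  # -> '' (daily; no day prefix needed)
parse_weekdays('выходной')   # -> None, after logging a warning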
@@ -27,9 +49,9 @@ def dataset(fileobj):
     shops = h.find_class('shops-in-the-city-holder')[0]
     shops.make_links_absolute(download_url_copy)
     blocks = shops.xpath("//div[@class='mark-box'] | //ul[@class='shops-list']")
     logging.debug('Found %s blocks', len(blocks))
     name = None
     RE_GMAPS = re.compile(r'q=(-?[0-9.]+)\+(-?[0-9.]+)$')
+    RE_OH = re.compile(r'(Ежедневно|(?:(?:Пн|Вт|Ср|Чт|Пт|Сб|В[сc])[, -]*)+)[ сc:]+(\d\d?[:.]\d\d)[- до]+(\d\d[.:]\d\d)', re.I)
     data = []
     for block in blocks:
         if block.get('class') == 'mark-box':
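
Note: RE_OH captures the weekday span and the opening and closing times in one pass, tolerating dots or colons in times and a Latin 'c' in place of the Cyrillic 'с'. An illustrative match on a made-up schedule string:

m = RE_OH.search('Пн-Сб с 8.30 до 22.00')
m.group(1)  # 'Пн-Сб ' (weekday span; spaces are stripped by parse_weekdays)
m.group(2)  # '8.30'  (opening time)
m.group(3)  # '22.00' (closing time)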
@@ -51,7 +73,15 @@ def dataset(fileobj):
             if m:
                 lat = float(m.group(1))
                 lon = float(m.group(2))
-            logging.debug('Found title: %s, website: %s, address: %s, coords: %s, %s', title, website, addr, lat, lon)
+            opening_hours = []
+            # Extract opening hours
+            oh = ' '.join(li.xpath("p/text()"))
+            for m in RE_OH.finditer(oh):
+                weekdays = parse_weekdays(m.group(1))
+                if weekdays is not None:
+                    opening_hours.append('{}{:0>5s}-{:0>5s}'.format(
+                        weekdays + ' ' if weekdays else '', m.group(2).replace('.', ':'), m.group(3).replace('.', ':')))
+            logging.debug('Found title: %s, website: %s, opens: %s, coords: %s, %s', title, website, '; '.join(opening_hours) or None, lat, lon)
             if lat is not None and name is not None:
                 tags = {
                     'name': name,
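
Note: the '{:0>5s}' spec left-pads times with '0' to five characters, so '9:00' normalises to '09:00'. Combined with the optional weekday prefix:

'{}{:0>5s}-{:0>5s}'.format('Mo-Fr ', '9:00', '22:00')  # -> 'Mo-Fr 09:00-22:00'
'{}{:0>5s}-{:0>5s}'.format('', '8:30', '22:00')        # -> '08:30-22:00' (daily)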
@@ -59,6 +89,7 @@ def dataset(fileobj):
                     'shop': 'supermarket',
                     'phone': '8-800-700-5-800',
                     'operator': 'ООО «АШАН»',
+                    'opening_hours': '; '.join(opening_hours),
                     'addr:full': addr,
                     'website': website
                 }
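
Note: with several schedule rules, the joined tag value follows the OSM opening_hours syntax, e.g.:

'; '.join(['Mo-Fr 08:30-22:00', 'Sa-Su 09:00-21:00'])
# -> 'Mo-Fr 08:30-22:00; Sa-Su 09:00-21:00'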