Another example with HTML parsing

This commit is contained in:
Ilya Zverev 2017-02-16 14:19:48 +03:00
parent cd8c9b6e2f
commit a2cdbf3773
2 changed files with 80 additions and 4 deletions

View file

@ -216,7 +216,13 @@ class OsmConflator:
logging.debug('Overpass query: %s', query)
r = requests.get(OVERPASS_SERVER + 'interpreter', {'data': query})
if r.status_code != 200:
raise IOError('Failed to download data from Overpass API: {} {}\nQuery: {}'.format(r.status_code, r.text, query))
logging.error('Failed to download data from Overpass API: %s', r.status_code)
logging.error('Query: %s', query)
logging.error('Error message: %s', r.text)
if 'rate_limited' in r.text:
r = requests.get(OVERPASS_SERVER + 'status')
logging.warning('Seems like you are rate limited. API status:\n%s', r.text)
raise IOError()
for el in r.json()['elements']:
if 'tags' not in el:
continue
@ -299,8 +305,12 @@ class OsmConflator:
changed = False
for k, v in sp.tags.items():
if k not in p.tags or (k in master_tags and p.tags[k] != v):
p.tags[k] = v
changed = True
if v is not None:
p.tags[k] = v
changed = True
elif k in p.tags:
del p.tags[k]
changed = True
if changed:
p.action = 'modify'
# If not, action is None and we're not including this object into the osmChange
@ -309,7 +319,7 @@ class OsmConflator:
if self.ref is not None:
p.tags[self.ref] = sp.id
elif keep or p.is_area():
if not retag:
if retag is None:
retag = {'fixme': DELETED_FIXME}
for k, v in retag.items():
if v is not None:

66
profiles/auchan_moscow.py Normal file
View file

@ -0,0 +1,66 @@
# This profile requires lxml package
# NOTE(review): this file is a conflator "profile" — the module-level names
# below (download_url, source, query, ...) form the configuration interface
# read by the conflator framework that loads this file; do not rename them.
import logging
import re
# A web page with a list of shops in Moscow. You can replace it with one for another city
download_url = 'https://www.auchan.ru/ru/moscow/'
# Value written to the source tag of imported objects
source = 'auchan.ru'
# Not adding a ref:auchan tag, since we don't have good identifiers
no_dataset_id = True
# Using a name query with regular expressions
query = [('shop', 'supermarket'), ('name', '~Ашан|АШАН')]
# We don't parse opening hours yet, but it'd be cool if we did
master_tags = set(('name', 'opening_hours', 'phone'))
# Empty dict so we don't add a fixme tag to unmatched objects
tag_unmatched = {}
# Coordinates are VERY approximate, so increasing max distance to ~1 km
max_distance = 0.01
# For some reason, functions here cannot use variables defined above
# And defining them as "global" moves these from locals() to globals()
# (presumably because the framework exec()s this profile — TODO confirm),
# so the URL is copied into a separate name that dataset() can reach.
download_url_copy = download_url
def dataset(fileobj):
    """Parse the downloaded Auchan shop-list HTML page into source points.

    Args:
        fileobj: a binary file-like object with the page at download_url;
            its contents are decoded as UTF-8.

    Returns:
        A list of SourcePoint(title, lat, lon, tags) objects.
        NOTE(review): SourcePoint is not imported here — presumably it is
        injected into this profile's namespace by the conflator framework
        that executes this file; verify against the loader.

    Shops without a parseable Google Maps coordinate link, or appearing
    before any city-name header block, are silently skipped.
    """
    # We are parsing HTML, and for that we need an lxml package
    from lxml import html
    global download_url_copy
    h = html.fromstring(fileobj.read().decode('utf-8'))
    # The page lists shops grouped under city/name headers inside this holder
    shops = h.find_class('shops-in-the-city-holder')[0]
    # Turn relative hrefs into absolute URLs so 'website' tags are usable
    shops.make_links_absolute(download_url_copy)
    # Alternating sequence: a 'mark-box' header div, then a 'shops-list' ul
    blocks = shops.xpath("//div[@class='mark-box'] | //ul[@class='shops-list']")
    logging.debug('Found %s blocks', len(blocks))
    name = None
    # Extracts "lat+lon" from the trailing q= parameter of a Google Maps link
    RE_GMAPS = re.compile(r'q=(-?[0-9.]+)\+(-?[0-9.]+)$')
    data = []
    for block in blocks:
        if block.get('class') == 'mark-box':
            # A header block: remember the (case-normalized) brand name
            # for all shops listed in the following 'shops-list' block
            name = block.xpath("strong[contains(@class, 'name')]/text()")[0].replace('АШАН', 'Ашан')
            logging.debug('Name: %s', name)
        elif block.get('class') == 'shops-list':
            for li in block:
                # Each of title/website/addr may be absent; fall back to None
                title = li.xpath("strong[@class='title']/a/text()")
                title = title[0].lower() if title else None
                website = li.xpath("strong[@class='title']/a/@href")
                website = website[0] if website else None
                addr = li.xpath("p[1]/text()")
                addr = addr[0].strip() if addr else None
                lat = None
                lon = None
                # Coordinates come only from an embedded Google Maps link
                gmapslink = li.xpath(".//a[contains(@href, 'maps.google')]/@href")
                if gmapslink:
                    m = RE_GMAPS.search(gmapslink[0])
                    if m:
                        lat = float(m.group(1))
                        lon = float(m.group(2))
                logging.debug('Found title: %s, website: %s, address: %s, coords: %s, %s', title, website, addr, lat, lon)
                # Keep only entries with coordinates and a known header name
                if lat is not None and name is not None:
                    tags = {
                        'name': name,
                        'brand': 'Auchan',
                        'shop': 'supermarket',
                        'phone': '8-800-700-5-800',
                        'operator': 'ООО «АШАН»',
                        'addr:full': addr,
                        'website': website
                    }
                    data.append(SourcePoint(title, lat, lon, tags))
    return data