Another example with HTML parsing
This commit is contained in:
parent
cd8c9b6e2f
commit
a2cdbf3773
2 changed files with 80 additions and 4 deletions
18
conflate.py
18
conflate.py
|
@ -216,7 +216,13 @@ class OsmConflator:
|
|||
logging.debug('Overpass query: %s', query)
|
||||
r = requests.get(OVERPASS_SERVER + 'interpreter', {'data': query})
|
||||
if r.status_code != 200:
|
||||
raise IOError('Failed to download data from Overpass API: {} {}\nQuery: {}'.format(r.status_code, r.text, query))
|
||||
logging.error('Failed to download data from Overpass API: %s', r.status_code)
|
||||
logging.error('Query: %s', query)
|
||||
logging.error('Error message: %s', r.text)
|
||||
if 'rate_limited' in r.text:
|
||||
r = requests.get(OVERPASS_SERVER + 'status')
|
||||
logging.warning('Seems like you are rate limited. API status:\n%s', r.text)
|
||||
raise IOError()
|
||||
for el in r.json()['elements']:
|
||||
if 'tags' not in el:
|
||||
continue
|
||||
|
@ -299,8 +305,12 @@ class OsmConflator:
|
|||
changed = False
|
||||
for k, v in sp.tags.items():
|
||||
if k not in p.tags or (k in master_tags and p.tags[k] != v):
|
||||
p.tags[k] = v
|
||||
changed = True
|
||||
if v is not None:
|
||||
p.tags[k] = v
|
||||
changed = True
|
||||
elif k in p.tags:
|
||||
del p.tags[k]
|
||||
changed = True
|
||||
if changed:
|
||||
p.action = 'modify'
|
||||
# If not, action is None and we're not including this object into the osmChange
|
||||
|
@ -309,7 +319,7 @@ class OsmConflator:
|
|||
if self.ref is not None:
|
||||
p.tags[self.ref] = sp.id
|
||||
elif keep or p.is_area():
|
||||
if not retag:
|
||||
if retag is None:
|
||||
retag = {'fixme': DELETED_FIXME}
|
||||
for k, v in retag.items():
|
||||
if v is not None:
|
||||
|
|
66
profiles/auchan_moscow.py
Normal file
66
profiles/auchan_moscow.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
# This profile requires the lxml package (imported lazily inside dataset()).
import logging
import re

# A web page with a list of shops in Moscow. You can replace it with one for another city
download_url = 'https://www.auchan.ru/ru/moscow/'
source = 'auchan.ru'
# Not adding a ref:auchan tag, since we don't have good identifiers
no_dataset_id = True
# Using a name query with regular expressions
query = [('shop', 'supermarket'), ('name', '~Ашан|АШАН')]
# We don't parse opening hours yet, but it'd be cool if we did.
# Set literal instead of set((...)) — same contents, idiomatic form.
master_tags = {'name', 'opening_hours', 'phone'}
# Empty dict so we don't add a fixme tag to unmatched objects
tag_unmatched = {}
# Coordinates are VERY approximate, so increasing max distance to ~1 km
max_distance = 0.01

# For some reason, functions here cannot use variables defined above
# And defining them as "global" moves these from locals() to globals()
download_url_copy = download_url
||||
def dataset(fileobj):
    """Scrape the Auchan Moscow shop-list page and return a list of SourcePoint.

    *fileobj* is a binary file object holding the downloaded HTML page.
    Shops without coordinates (no parsable Google Maps link) or without a
    preceding brand-name header are skipped.
    """
    # We are parsing HTML, and for that we need an lxml package
    from lxml import html
    global download_url_copy

    tree = html.fromstring(fileobj.read().decode('utf-8'))
    holder = tree.find_class('shops-in-the-city-holder')[0]
    # Make relative links absolute against the page URL
    holder.make_links_absolute(download_url_copy)
    sections = holder.xpath("//div[@class='mark-box'] | //ul[@class='shops-list']")
    logging.debug('Found %s blocks', len(sections))

    # Extracts "lat+lon" from a Google Maps link, e.g. ...?q=55.7+37.6
    coord_re = re.compile(r'q=(-?[0-9.]+)\+(-?[0-9.]+)$')
    points = []
    name = None  # set by each 'mark-box' header, reused for the list that follows
    for section in sections:
        css_class = section.get('class')
        if css_class == 'mark-box':
            # Header block: carries the shop name for subsequent lists
            name = section.xpath("strong[contains(@class, 'name')]/text()")[0].replace('АШАН', 'Ашан')
            logging.debug('Name: %s', name)
        elif css_class == 'shops-list':
            for item in section:
                titles = item.xpath("strong[@class='title']/a/text()")
                title = titles[0].lower() if titles else None
                hrefs = item.xpath("strong[@class='title']/a/@href")
                website = hrefs[0] if hrefs else None
                addrs = item.xpath("p[1]/text()")
                addr = addrs[0].strip() if addrs else None
                lat = lon = None
                map_links = item.xpath(".//a[contains(@href, 'maps.google')]/@href")
                if map_links:
                    match = coord_re.search(map_links[0])
                    if match:
                        lat = float(match.group(1))
                        lon = float(match.group(2))
                logging.debug('Found title: %s, website: %s, address: %s, coords: %s, %s', title, website, addr, lat, lon)
                if lat is not None and name is not None:
                    points.append(SourcePoint(title, lat, lon, {
                        'name': name,
                        'brand': 'Auchan',
                        'shop': 'supermarket',
                        'phone': '8-800-700-5-800',
                        'operator': 'ООО «АШАН»',
                        'addr:full': addr,
                        'website': website
                    }))
    return points
|
Loading…
Add table
Reference in a new issue