Example for using param in minkult.py, and remove imports for profiles

This commit is contained in:
Ilya Zverev 2018-01-16 17:04:47 +03:00
parent 7d0a631874
commit 79c7ab80ce
6 changed files with 32 additions and 14 deletions

View file

@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import argparse
import codecs
import json
import kdtree
import logging
import math
@@ -8,10 +9,6 @@ import requests
import os
import sys
from io import BytesIO
import json # for profiles
import re # for profiles
import zipfile # for profiles
from collections import defaultdict # for profiles
try:
from .version import __version__
except ImportError:

View file

@@ -44,7 +44,7 @@ def dataset(fileobj):
# We are parsing HTML, and for that we need an lxml package
from lxml import html
global download_url_copy
global download_url_copy, re
h = html.fromstring(fileobj.read().decode('utf-8'))
shops = h.find_class('shops-in-the-city-holder')[0]
shops.make_links_absolute(download_url_copy)

View file

@@ -1,3 +1,7 @@
# Note: the json file at the burgerking website was restructured
# and does not contain any useful data now.
# So this profile is here solely for demonstration purposes.
import json
import codecs
import re
@@ -20,6 +24,7 @@ tag_unmatched = {
def dataset(fileobj):
def parse_hours(s):
global re
s = re.sub('^зал:? *', '', s.lower())
s = s.replace('<br />', ';').replace('<br>', ';').replace('\n', ';').replace(' ', '').replace(',', ';').replace('–', '-')
s = s.replace('-00:', '-24:')
@@ -66,7 +71,11 @@ def dataset(fileobj):
346: 'Передвинуть к кафе',
}
source = json.load(codecs.getreader('utf-8')(fileobj))
json_src = codecs.getreader('utf-8')(fileobj).read()
p = json_src.find('<div')
if p > 0:
json_src = json_src[:p]
source = json.loads(json_src)
data = []
for el in source:
gid = int(el['origID'])

View file

@@ -5,7 +5,8 @@ import codecs
# Reading the dataset passport to determine an URL of the latest dataset version
def download_url(dataset_id='7705851331-museums'):
def download_url():
dataset_id = '7705851331-' + (param or 'museums')
r = requests.get('http://opendata.mkrf.ru/opendata/{}/meta.json'.format(dataset_id))
if r.status_code != 200 or len(r.content) == 0:
logging.error('Could not get URL for dataset: %s %s', r.status_code, r.text)
@@ -17,8 +18,18 @@ def download_url(dataset_id='7705851331-museums'):
return latest['source']
source = 'opendata.mkrf.ru'
dataset_id = 'mkrf_museums'
query = [('tourism', 'museum')]
dataset_id = 'mkrf_'+(param or 'museums')
if not param or param == 'museums':
query = [('tourism', 'museum')]
elif param == 'theaters':
query = [('amenity', 'theatre')]
elif param == 'circuses':
query = [('amenity', 'circus')]
elif param == 'philharmonic':
query = [('amenity', 'theatre')]
else:
raise ValueError('Unknown param value: {}'.format(param))
max_distance = 300
master_tags = ('official_name', 'phone', 'opening_hours', 'website')

View file

@@ -1,12 +1,10 @@
# Available modules: codecs, logging, requests, json, re, etree. But importing these helps catch other errors
# Available modules: codecs, logging, requests, json, etree. But importing these helps catch other errors
import json
import re
import logging
import requests
import zipfile
def download_url(mos_dataset_id=1421):
import requests
r = requests.get('https://data.mos.ru/api/datasets/expformats/?datasetId={}'.format(mos_dataset_id))
if r.status_code != 200 or len(r.content) == 0:
logging.error('Could not get URL for dataset: %s %s', r.status_code, r.text)
@@ -15,7 +13,7 @@ def download_url(mos_dataset_id=1421):
url = [x for x in r.json() if x['Format'] == 'json'][0]
version = '?'
title = 'dataset'
r = requests.get('https://data.mos.ru/apiproxy/opendata/1421/meta.json'.format(mos_dataset_id))
r = requests.get('https://data.mos.ru/apiproxy/opendata/{}/meta.json'.format(mos_dataset_id))
if r.status_code == 200:
title = r.json()['Title']
version = r.json()['VersionNumber']
@@ -50,6 +48,8 @@ master_tags = ('zone:parking', 'ref', 'contact:phone', 'contact:website', 'opera
# A list of SourcePoint objects. Initialize with (id, lat, lon, {tags}).
def dataset(fileobj):
import zipfile
import re
zf = zipfile.ZipFile(fileobj)
source = json.loads(zf.read(zf.namelist()[0]).decode('cp1251'))
RE_NUM4 = re.compile(r'\d{4,6}')

View file

@@ -56,6 +56,7 @@ def dataset(fileobj):
return '24/7'
return '; '.join(res).replace('23:59', '24:00')
global re, defaultdict
source = json.load(codecs.getreader('utf-8-sig')(fileobj))
data = []
for el in source['Locations']: