Started working on geocoding points
This commit is contained in:
parent
862c61cde6
commit
65d9cfd9dd
3 changed files with 101 additions and 1 deletions
|
@ -8,6 +8,7 @@ import math
|
||||||
import requests
|
import requests
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
import struct
|
||||||
import sys
|
import sys
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
@ -42,6 +43,7 @@ class SourcePoint:
|
||||||
self.category = category
|
self.category = category
|
||||||
self.dist_offset = 0
|
self.dist_offset = 0
|
||||||
self.remarks = remarks
|
self.remarks = remarks
|
||||||
|
self.region = None
|
||||||
self.exclusive_group = None
|
self.exclusive_group = None
|
||||||
|
|
||||||
def distance(self, other):
|
def distance(self, other):
|
||||||
|
@ -1120,6 +1122,100 @@ def check_dataset_for_duplicates(profile, dataset, print_all=False):
|
||||||
raise KeyError('Cannot continue with duplicate ids')
|
raise KeyError('Cannot continue with duplicate ids')
|
||||||
|
|
||||||
|
|
||||||
|
def init_geocoder():
    """Load ``places.bin`` and build a k-d tree of the places it contains.

    The file is looked up next to this module. Its layout, as parsed below
    (multi-byte integers are read as little-endian):

    * 1 byte: number of countries, followed by that many 2-byte ASCII codes;
    * 2 bytes: number of regions, followed by that many length-prefixed
      (1 byte) ASCII strings;
    * a sequence of 9-byte records until the end of the file:
      3 bytes longitude, 3 bytes latitude (both scaled by 10000),
      1 byte country index, 2 bytes region index.

    Returns:
        The tree produced by ``kdtree.create`` over the loaded points, or
        ``None`` when the file is missing or holds no place records.

    Raises:
        struct.error: if the country/region header is truncated.
        IndexError: if a record refers to a country or region index that
            is not present in the header tables.
    """
    class PlacePoint:
        """A (lon, lat) point carrying its country and region.

        ``__len__`` and ``__getitem__`` expose the coordinate pair so that
        the object can be used wherever a plain 2-item sequence is expected.
        """

        def __init__(self, lon, lat, country, region):
            self.coord = (lon, lat)
            self.country = country
            self.region = region

        def __len__(self):
            return len(self.coord)

        def __getitem__(self, i):
            return self.coord[i]

    filename = os.path.join(os.getcwd(), os.path.dirname(__file__), 'places.bin')
    if not os.path.exists(filename):
        return None

    # One place record: lon (3 bytes), lat (3 bytes), country idx, region idx.
    # '<' selects little-endian with no padding, so the size is exactly 9.
    record = struct.Struct('<3s3sBh')

    places = []
    with open(filename, 'rb') as f:
        cnt = struct.unpack('B', f.read(1))[0]
        countries = [struct.unpack('2s', f.read(2))[0].decode('ascii')
                     for _ in range(cnt)]

        # Explicit little-endian, matching the '<l' used for coordinates;
        # a bare 'h' would follow the byte order of the host machine.
        cnt = struct.unpack('<h', f.read(2))[0]
        regions = []
        for _ in range(cnt):
            length = struct.unpack('B', f.read(1))[0]
            regions.append(f.read(length).decode('ascii'))

        # Read one whole record per iteration. The previous loop read the
        # longitude only once, before the loop, so it never advanced to
        # the next record and could only end by failing at end of file.
        while True:
            chunk = f.read(record.size)
            if len(chunk) < record.size:
                if chunk:
                    logging.warning(
                        'Ignoring %s trailing bytes in the geocoding file',
                        len(chunk))
                break
            dlon, dlat, country, region = record.unpack(chunk)
            # NOTE(review): padding the 3-byte value with a zero byte can
            # only yield non-negative numbers; confirm how the file writer
            # encodes negative coordinates.
            places.append(PlacePoint(
                struct.unpack('<l', dlon + b'\0')[0] / 10000,
                struct.unpack('<l', dlat + b'\0')[0] / 10000,
                countries[country], regions[region]))

    if not places:
        return None
    return kdtree.create(places)
|
||||||
|
|
||||||
|
|
||||||
|
def add_regions(profile, dataset, opt_regions):
    """Assign a ``region`` to every dataset point and optionally filter by it.

    The profile's raw ``regions`` setting selects the mode:

    * falsy: do nothing and return immediately;
    * a callable: called for every point to produce its region;
    * ``True``, ``4``, or a string starting with ``reg`` or containing
      ``4``: use the region of the nearest place for all points;
    * ``2``, or a string starting with ``cou`` or containing ``2``:
      use the country of the nearest place for all points;
    * ``'some'``: use regions for ``US`` and ``RU``, countries elsewhere;
    * a list of country codes: use regions for those countries (codes are
      upper-cased), countries elsewhere.

    Args:
        profile: object whose ``get_raw('regions')`` returns the setting.
        dataset: mutable sequence of points with ``lon`` and ``lat``
            attributes; each gets a ``region`` attribute, and filtered-out
            points are removed in place.
        opt_regions: comma-separated list of regions to keep, or to drop
            when prefixed with ``-`` or ``^``. Falsy means no filtering.
            The filter is not applied when the geocoding data is missing.
    """
    regions = profile.get_raw('regions')
    if not regions:
        return

    logging.info('Geocoding regions')
    if not callable(regions):
        if regions is True or regions == 4:
            regions = 'all'
        elif regions == 2:
            regions = []
        if isinstance(regions, str):
            regions = regions.lower()
            if regions[:3] == 'reg' or '4' in regions:
                regions = 'all'
            elif regions[:3] == 'cou' or '2' in regions:
                regions = []
            elif regions == 'some':
                regions = ['US', 'RU']
        if isinstance(regions, list):
            # Build a new upper-cased list. The old loop used the list's
            # string elements as indices and raised TypeError.
            regions = [code.upper() for code in regions]

    # Finally, geocode
    places = init_geocoder()
    if not places:
        if callable(regions):
            logging.warning('Could not find the geocoding file')
            # NOTE(review): the callable gets one argument here but two in
            # the geocoded path below; confirm it accepts both forms.
            for d in dataset:
                d.region = regions(d)
        else:
            logging.error('Could not find the geocoding file, no regions were added')
        return

    for d in dataset:
        reg, _ = places.search_nn((d.lon, d.lat))
        if callable(regions):
            d.region = regions(d, reg.data.region)
        elif regions == 'all' or reg.data.country in regions:
            d.region = reg.data.region
        else:
            d.region = reg.data.country

    # Filter regions
    if opt_regions:
        negate = opt_regions[0] in ('-', '^')
        if negate:
            opt_regions = opt_regions[1:]
        filtr = {r.strip().upper() for r in opt_regions.split(',')}
        # Keep listed regions, or unlisted ones when negated. A single
        # in-place slice assignment replaces the quadratic delete loop.
        dataset[:] = [d for d in dataset if (d.region in filtr) != negate]
|
||||||
|
|
||||||
|
|
||||||
def write_for_filter(profile, dataset, f):
|
def write_for_filter(profile, dataset, f):
|
||||||
def query_to_tag_strings(query):
|
def query_to_tag_strings(query):
|
||||||
if isinstance(query, str):
|
if isinstance(query, str):
|
||||||
|
@ -1199,6 +1295,8 @@ def run(profile=None):
|
||||||
help='Prepare a file for the filtering script')
|
help='Prepare a file for the filtering script')
|
||||||
parser.add_argument('-d', '--list_duplicates', action='store_true',
|
parser.add_argument('-d', '--list_duplicates', action='store_true',
|
||||||
help='List all duplicate points in the dataset')
|
help='List all duplicate points in the dataset')
|
||||||
|
parser.add_argument('-r', '--regions',
|
||||||
|
help='Conflate only points with regions in this comma-separated list')
|
||||||
parser.add_argument('-v', '--verbose', action='store_true',
|
parser.add_argument('-v', '--verbose', action='store_true',
|
||||||
help='Display debug messages')
|
help='Display debug messages')
|
||||||
parser.add_argument('-q', '--quiet', action='store_true',
|
parser.add_argument('-q', '--quiet', action='store_true',
|
||||||
|
@ -1232,6 +1330,7 @@ def run(profile=None):
|
||||||
transform_dataset(profile, dataset)
|
transform_dataset(profile, dataset)
|
||||||
add_categories_to_dataset(profile, dataset)
|
add_categories_to_dataset(profile, dataset)
|
||||||
check_dataset_for_duplicates(profile, dataset, options.list_duplicates)
|
check_dataset_for_duplicates(profile, dataset, options.list_duplicates)
|
||||||
|
add_regions(profile, dataset, options.regions)
|
||||||
logging.info('Read %s items from the dataset', len(dataset))
|
logging.info('Read %s items from the dataset', len(dataset))
|
||||||
|
|
||||||
if options.for_filter:
|
if options.for_filter:
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
__version__ = '1.3.3'
|
__version__ = '1.4.0'
|
||||||
|
|
1
setup.py
1
setup.py
|
@ -10,6 +10,7 @@ setup(
|
||||||
author='Ilya Zverev',
|
author='Ilya Zverev',
|
||||||
author_email='ilya@zverev.info',
|
author_email='ilya@zverev.info',
|
||||||
packages=['conflate'],
|
packages=['conflate'],
|
||||||
|
package_data={'conflate': ['places.bin']},
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'kdtree',
|
'kdtree',
|
||||||
'requests',
|
'requests',
|
||||||
|
|
Loading…
Add table
Reference in a new issue