Added experimental article-geocoding scripts
parent 155ab9ddb1
commit ba3667f65f
2 changed files with 177 additions and 0 deletions
geocoder/geocode-1.py (new file, 134 lines added)
@@ -0,0 +1,134 @@

# -*- coding: utf-8 -*-
# Python 2 script: geocode the addresses found in a saved listings page
# (London.html, Murmansk.html, ...) through the MapQuest-hosted Nominatim
# endpoint, add "[map]" links to the HTML (out.html) and dump the geocoded
# points as OSM XML (file.osm).
import sys
import os
import shutil
import unicodedata
import urllib
import pprint
import json
import hashlib
try:
    from bs4 import BeautifulSoup
except ImportError:
    from BeautifulSoup import BeautifulSoup  # note: soup.new_tag() below needs bs4

reload(sys)
sys.setdefaultencoding('utf-8')

"""
page_address = {
    "countrycodes": "CH",
    "country": "Switzerland",
    "city": "Zurich"
}
soup = BeautifulSoup(open("Zurich.html"))
"""

page_address = {
    "countrycodes": "GB",
    "country": "United Kingdom",
    "city": "London"
}
soup = BeautifulSoup(open("London.html"))

page_address = {
    "countrycodes": "RU",
    "country": "Russia",
    "city": "Murmansk"
}
soup = BeautifulSoup(open("Murmansk.html"))

# the last assignment wins: Tokyo.html is the page actually processed below
page_address = {
    "countrycodes": "JP",
    "country": "Japan",
    "city": "Tokyo"
}
soup = BeautifulSoup(open("Tokyo.html"))


def nominatim_geocode(address):
    """Query Nominatim for the given address dict, caching the raw JSON on disk."""
    try:
        os.makedirs("nominatim")
    except OSError:
        pass
    key = os.path.join("nominatim", hashlib.md5(json.dumps(address)).hexdigest())
    if os.path.exists(key):
        val = open(key).read()
    else:
        # '&' before the encoded params so they are not glued onto accept-language=en
        val = urllib.urlopen("http://open.mapquestapi.com/nominatim/v1/search.php?format=json&accept-language=en&" + urllib.urlencode(address)).read()
        open(key, "w").write(val)
    try:
        val = json.loads(val)
    except ValueError:
        val = ""
    return val


def nominatim_bbox_transform(bbox):
    # Nominatim returns [south, north, west, east]; reorder to [west, south, east, north].
    return [float(bbox[2]), float(bbox[0]), float(bbox[3]), float(bbox[1])]


def geocode_bbox(string, city_bbox, place_params):
    """Geocode a free-form string, keeping only hits inside the city bounding box."""
    place_params["q"] = string
    is_house = any(char.isdigit() for char in str(string))
    try:
        for place in nominatim_geocode(place_params):
            # a string containing digits should resolve to a house, not a whole street
            if is_house and place["class"] == "highway":
                continue
            #print city_bbox
            if not(float(place["lon"]) < city_bbox[0] or float(place["lon"]) > city_bbox[2] or float(place["lat"]) < city_bbox[1] or float(place["lat"]) > city_bbox[3]):
                print place
                break
        else:
            raise IndexError
        # print >> osm, '<node id="%s" lat="%s" lon="%s" version="1"><tag k="name" v="%s" /></node>'%(id, place["lat"], place["lon"], i)
    except IndexError:
        return False
    return (float(place["lon"]), float(place["lat"]))


#pprint.pprint(texts_to_geocode)
#print len(texts_to_geocode)

city_geocode = nominatim_geocode(page_address)[0]
city_bbox = nominatim_bbox_transform(city_geocode["boundingbox"])
place_params = {"viewbox": ",".join([str(x) for x in city_bbox]), "countrycodes": page_address["countrycodes"]}
texts_to_geocode = set()  # note: never populated here, so the OSM export loop below writes an empty file

for s in soup.findAll("b") + soup.findAll("span", "label listing-address"):
    coord = geocode_bbox(s.getText(), city_bbox, place_params)
    if coord:
        print s.getText(), coord
        hrefLink = u"mapswithme://map?v=1&ll=%s,%s" % (coord[1], coord[0])
        hrefLink = "http://www.openstreetmap.org/?mlat=%s&mlon=%s#map=19/%s/%s" % (coord[1], coord[0], coord[1], coord[0])
        mapTag = soup.new_tag("a", href=hrefLink)
        mapTag["class"] = "geolink"
        mapTag.string = "[map]"
        s.append(mapTag)
        # s["style"] = "color: red;"
    else:
        print s.getText(), "BAD!!!"

open('out.html', 'w').write(str(soup))


osm = open("file.osm", "w")
print >> osm, '<osm version="0.6">'
id = 0
bad_cnt = 0
for i in texts_to_geocode:
    id += 1
    place_params["q"] = i
    try:
        for place in nominatim_geocode(place_params):
            print place
            print city_bbox
            if not(float(place["lon"]) < city_bbox[0] or float(place["lon"]) > city_bbox[2] or float(place["lat"]) < city_bbox[1] or float(place["lat"]) > city_bbox[3]):
                break
        else:
            raise IndexError
        print >> osm, '<node id="%s" lat="%s" lon="%s" version="1"><tag k="name" v="%s" /></node>'%(id, place["lat"], place["lon"], i)

        #print >> osm, '<node id="%s" lat="%s" lon="%s" version="1"><tag k="name" v="%s" /></node>'%(id, place_params["viewbox"].split(",")[0], place_params["viewbox"].split(",")[1], i)
        #id += 1
        #print >> osm, '<node id="%s" lat="%s" lon="%s" version="1"><tag k="name" v="%s" /></node>'%(id, place_params["viewbox"].split(",")[2], place_params["viewbox"].split(",")[3], i)
    except IndexError:
        bad_cnt += 1
        print bad_cnt, i

print >> osm, '</osm>'
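For reference, a minimal sketch (not part of the commit) of how the cached lookup above could be exercised on its own. The address dict mirrors the page_address examples, the city name is purely illustrative, and the result fields assume a standard Nominatim JSON response:

# Hypothetical driver (Python 2, same idioms as the script); assumes
# nominatim_geocode() and nominatim_bbox_transform() from geocode-1.py are in scope.
sample_address = {"countrycodes": "DE", "country": "Germany", "city": "Berlin"}
results = nominatim_geocode(sample_address)                # cached list of Nominatim matches
if results:
    best = results[0]
    bbox = nominatim_bbox_transform(best["boundingbox"])   # [west, south, east, north]
    print best.get("display_name", ""), bbox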
geocoder/geocode.py (new file, 43 lines added)
@@ -0,0 +1,43 @@

# Python 2 helper for batch-geocoding geocodes_todo.txt against the public
# OpenStreetMap Nominatim API, driven by a mode argument:
#   1 - print wget commands that fetch JSON results into dl/
#   2 - print an id / lat / lon table from the downloaded JSON
#   3 - print wget retry commands (free-form query) for ids whose JSON is empty or invalid
import sys
import os
import json
import urllib


reload(sys)
sys.setdefaultencoding("utf-8")


if sys.argv[1] == '1':
    strings = [[l.replace("_", " ") for l in l.strip().split()] for l in open("geocodes_todo.txt")]
    #print strings
    for string in strings:
        print "wget 'http://nominatim.openstreetmap.org/search?" + urllib.urlencode({'city': string[1], 'country': string[2], 'format': 'json'}) + "' -O dl/" + string[0] + ".json"

elif sys.argv[1] == '2':
    files = os.listdir('dl/')
    for file in files:
        try:
            c = json.loads(open(os.path.join('dl', file)).read())
            if c:
                print file.split('.')[0] + '\t' + c[0]["lat"] + '\t' + c[0]["lon"]
        except ValueError:
            pass

elif sys.argv[1] == '3':
    strings = [[l.replace("_", " ").replace("/", " ").replace("(", " ").replace(")", " ") for l in l.strip().split()] for l in open("geocodes_todo.txt")]
    files = os.listdir('dl/')
    bad_ids = set()
    # collect the ids whose downloaded JSON is empty or failed to parse
    for file in files:
        try:
            c = json.loads(open(os.path.join('dl', file)).read())
            if c:
                continue
        except ValueError:
            pass
        bad_ids.add(file.split('.')[0])
    for string in strings:
        if string[0] in bad_ids:
            print "wget 'http://nominatim.openstreetmap.org/search?" + urllib.urlencode({'q': string[1] + " " + string[2], 'format': 'json'}) + "' -O dl2/" + string[0] + ".json"
            #print string[0], string[1], string[2]
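A rough usage note, inferred from the code rather than stated in the commit: geocodes_todo.txt appears to hold one whitespace-separated record per line of the form <id> <city> <country>, with underscores standing in for spaces inside a token (a hypothetical line might read "42 New_York United_States"). Mode 1 then prints wget commands such as "wget 'http://nominatim.openstreetmap.org/search?city=New+York&country=United+States&format=json' -O dl/42.json" (parameter order may vary, since it comes from a dict), mode 2 converts the downloaded JSON into tab-separated id, lat, lon lines, and mode 3 prints retry commands with a free-form q= query into dl2/ for every id whose JSON came back empty or unparseable; presumably the printed commands are piped to a shell by hand.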