Added experimental article geocoding scripts
parent 155ab9ddb1
commit ba3667f65f
2 changed files with 177 additions and 0 deletions
geocoder/geocode-1.py (Normal file, 134 lines added)
@@ -0,0 +1,134 @@
# -*- coding: utf-8 -*-
import sys
import os
import shutil
import unicodedata
import urllib
import pprint
import json
import hashlib
try:
    from bs4 import BeautifulSoup
except ImportError:
    from BeautifulSoup import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf-8')
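
# Pick the saved city page to process: each block below sets the Nominatim
# query parameters and parses the matching saved HTML file. Only the last
# assignment (Tokyo) takes effect; the Zurich block is disabled via the
# triple-quoted string, and the London/Murmansk values are overwritten.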
"""
|
||||
page_address = {
|
||||
"countrycodes": "CH",
|
||||
"country": "Switzerland",
|
||||
"city": "Zurich"
|
||||
}
|
||||
soup = BeautifulSoup(open("Zurich.html"))
|
||||
"""
|
||||
|
||||
page_address = {
|
||||
"countrycodes": "GB",
|
||||
"country": "United Kingdom",
|
||||
"city": "London"
|
||||
}
|
||||
soup = BeautifulSoup(open("London.html"))
|
||||
|
||||
page_address = {
|
||||
"countrycodes": "RU",
|
||||
"country": "Russia",
|
||||
"city": "Murmansk"
|
||||
}
|
||||
soup = BeautifulSoup(open("Murmansk.html"))
|
||||
|
||||
page_address = {
|
||||
"countrycodes": "JP",
|
||||
"country": "Japan",
|
||||
"city": "Tokyo"
|
||||
}
|
||||
soup = BeautifulSoup(open("Tokyo.html"))
|
||||
|
||||
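
# Geocode via the MapQuest-hosted Nominatim endpoint, caching each raw JSON
# response on disk under nominatim/ keyed by the MD5 of the request
# parameters, so repeated runs do not re-query the service.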
def nominatim_geocode(address):
    try:
        os.makedirs("nominatim")
    except OSError:
        pass
    key = os.path.join("nominatim", hashlib.md5(json.dumps(address)).hexdigest())
    if os.path.exists(key):
        val = open(key).read()
    else:
        val = urllib.urlopen("http://open.mapquestapi.com/nominatim/v1/search.php?format=json&accept-language=en&" + urllib.urlencode(address)).read()
        open(key, "w").write(val)
    try:
        val = json.loads(val)
    except ValueError:
        val = ""
    return val

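
# Reorder Nominatim's boundingbox [min lat, max lat, min lon, max lon] into
# [min lon, min lat, max lon, max lat]; this is the order the script uses for
# the in-box checks below and for the viewbox parameter it sends.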
def nominatim_bbox_transform(bbox):
    return [float(bbox[2]), float(bbox[0]), float(bbox[3]), float(bbox[1])]

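
# Geocode a text snippet and return (lon, lat) for the first result that falls
# inside the city bounding box; for strings containing digits (likely house
# addresses) results classified as plain highways are skipped. Returns False
# when nothing suitable is found.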
def geocode_bbox(string, city_bbox, place_params):
    place_params["q"] = string
    is_house = any(char.isdigit() for char in str(string))
    try:
        for place in nominatim_geocode(place_params):
            if is_house and place["class"] == "highway":
                continue
            #print city_bbox
            if not(float(place["lon"]) < city_bbox[0] or float(place["lon"]) > city_bbox[2] or float(place["lat"]) < city_bbox[1] or float(place["lat"]) > city_bbox[3]):
                print place
                break
        else:
            raise IndexError
        # print >> osm, '<node id="%s" lat="%s" lon="%s" version="1"><tag k="name" v="%s" /></node>'%(id, place["lat"], place["lon"], i)
    except IndexError:
        return False
    return (float(place["lon"]), float(place["lat"]))

#pprint.pprint(texts_to_geocode)
#print len(texts_to_geocode)

city_geocode = nominatim_geocode(page_address)[0]
city_bbox = nominatim_bbox_transform(city_geocode["boundingbox"])
place_params = {"viewbox": ",".join([str(x) for x in city_bbox]), "countrycodes": page_address["countrycodes"]}
texts_to_geocode = set()

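# Walk the saved listing page: every <b> element and every
# <span class="label listing-address"> is treated as a candidate address.
# Successfully geocoded elements get an appended [map] link pointing at
# openstreetmap.org (a mapswithme:// deep link is built first but then
# overwritten); failures are only reported on stdout.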
for s in soup.findAll("b") + soup.findAll("span", "label listing-address"):
    coord = geocode_bbox(s.getText(), city_bbox, place_params)
    if coord:
        print s.getText(), coord
        hrefLink = u"mapswithme://map?v=1&ll=%s,%s" % (coord[1], coord[0])
        hrefLink = "http://www.openstreetmap.org/?mlat=%s&mlon=%s#map=19/%s/%s" % (coord[1], coord[0], coord[1], coord[0])
        mapTag = soup.new_tag("a", href=hrefLink)
        mapTag["class"] = "geolink"
        mapTag.string = "[map]"
        s.append(mapTag)
        # s["style"] = "color: red;"
    else:
        print s.getText(), "BAD!!!"

open('out.html', 'w').write(str(soup))

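
# Export geocoded texts as nodes in a minimal .osm file for inspection (e.g.
# in JOSM). Note that texts_to_geocode is never populated in this script, so
# as committed the loop body does not run and file.osm contains only the
# <osm> wrapper.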
osm = open("file.osm", "w")
print >> osm, '<osm version="0.6">'
id = 0
bad_cnt = 0
for i in texts_to_geocode:
    id += 1
    place_params["q"] = i
    try:
        for place in nominatim_geocode(place_params):
            print place
            print city_bbox
            if not(float(place["lon"]) < city_bbox[0] or float(place["lon"]) > city_bbox[2] or float(place["lat"]) < city_bbox[1] or float(place["lat"]) > city_bbox[3]):
                break
        else:
            raise IndexError
        print >> osm, '<node id="%s" lat="%s" lon="%s" version="1"><tag k="name" v="%s" /></node>'%(id, place["lat"], place["lon"], i)

        #print >> osm, '<node id="%s" lat="%s" lon="%s" version="1"><tag k="name" v="%s" /></node>'%(id, place_params["viewbox"].split(",")[0], place_params["viewbox"].split(",")[1], i)
        #id += 1
        #print >> osm, '<node id="%s" lat="%s" lon="%s" version="1"><tag k="name" v="%s" /></node>'%(id, place_params["viewbox"].split(",")[2], place_params["viewbox"].split(",")[3], i)
    except IndexError:
        bad_cnt += 1
        print bad_cnt, i

print >> osm, '</osm>'
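
# Assumed usage (not documented in the commit): save the rendered listing page
# next to this script as Tokyo.html (or adjust the page_address block) and run
#   python geocode-1.py
# to produce out.html with [map] links and file.osm.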
geocoder/geocode.py (Normal file, 43 lines added)
@@ -0,0 +1,43 @@
import sys
import os
import json
import urllib


reload(sys)
sys.setdefaultencoding("utf-8")

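
# Batch geocoder for a whitespace-separated worklist, driven by sys.argv[1]:
#   '1' - read geocodes_todo.txt (id, city, country per line; underscores
#         stand for spaces) and print wget commands that fetch Nominatim
#         results into dl/<id>.json
#   '2' - parse the downloaded JSON files and print "id<TAB>lat<TAB>lon"
#   '3' - collect ids whose download is empty or unparsable and print retry
#         wget commands that use a free-form q= query, saving into dl2/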
if sys.argv[1] == '1':
    strings = [[l.replace("_"," ") for l in l.strip().split()] for l in open("geocodes_todo.txt")]
    #print strings
    for string in strings:
        print "wget 'http://nominatim.openstreetmap.org/search?"+ urllib.urlencode({'city':string[1], 'country':string[2], 'format': 'json'}) + "' -O dl/" + string[0] + ".json"

elif sys.argv[1] == '2':
    files = os.listdir('dl/')
    for file in files:
        try:
            c = json.loads(open(os.path.join('dl', file)).read())
            if c:
                print file.split('.')[0]+'\t'+ c[0]["lat"]+'\t'+c[0]["lon"]
        except ValueError:
            pass

elif sys.argv[1] == '3':
    strings = [[l.replace("_"," ").replace("/", " ").replace("(", " ").replace(")", " ") for l in l.strip().split()] for l in open("geocodes_todo.txt")]
    files = os.listdir('dl/')
    bad_ids = set()
    for file in files:
        try:
            c = json.loads(open(os.path.join('dl', file)).read())
            if c:
                continue
        except ValueError:
            pass
        bad_ids.add(file.split('.')[0])
    for string in strings:
        if string[0] in bad_ids:
            print "wget 'http://nominatim.openstreetmap.org/search?"+ urllib.urlencode({'q': string[1]+" "+string[2], 'format': 'json'}) + "' -O dl2/" + string[0] + ".json"
            #print string[0], string[1], string[2]
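
# Assumed usage (not part of the commit): pipe the generated wget commands
# through a shell, e.g.
#   python geocode.py 1 | sh
#   python geocode.py 2 > geocodes.tsv
#   python geocode.py 3 | sh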