Added experimental articles geocoding scripts

This commit is contained in:
Alexander Zolotarev 2014-05-08 22:28:42 +07:00
parent 155ab9ddb1
commit ba3667f65f
2 changed files with 177 additions and 0 deletions

134
geocoder/geocode-1.py Normal file
View file

@ -0,0 +1,134 @@
# -*- coding: utf-8 -*-
import sys
import os
import shutil
import unicodedata
import urllib
import pprint
import json
import hashlib
try:
from bs4 import BeautifulSoup
except ImportError:
from BeautifulSoup import BeautifulSoup
# Python 2 idiom: reload(sys) makes sys.setdefaultencoding callable again so
# the process-wide default codec can be switched to UTF-8 (presumably so the
# implicit str() conversions of page text further down do not fail on
# non-ASCII characters).
reload(sys)
sys.setdefaultencoding('utf-8')

# Article pages this experiment knows about, keyed by city name. Each value is
# the address query sent to Nominatim to locate the city itself; the article
# is read from "<city>.html" in the current working directory.
KNOWN_PAGES = {
    "Zurich": {
        "countrycodes": "CH",
        "country": "Switzerland",
        "city": "Zurich",
    },
    "London": {
        "countrycodes": "GB",
        "country": "United Kingdom",
        "city": "London",
    },
    "Murmansk": {
        "countrycodes": "RU",
        "country": "Russia",
        "city": "Murmansk",
    },
    "Tokyo": {
        "countrycodes": "JP",
        "country": "Japan",
        "city": "Tokyo",
    },
}

# Exactly one page is processed per run; edit this to switch cities.
ACTIVE_PAGE = "Tokyo"

page_address = KNOWN_PAGES[ACTIVE_PAGE]
# Only the selected page is opened, so the other HTML files need not exist,
# and the handle is closed as soon as the markup has been parsed.
with open(ACTIVE_PAGE + ".html") as page_file:
    soup = BeautifulSoup(page_file)
def nominatim_geocode(address):
    """Run a Nominatim search through the MapQuest endpoint, with a disk cache.

    address -- dict of query parameters; it is url-encoded onto the search URL
               and its JSON serialisation determines the cache file name.

    Raw responses are stored under ./nominatim/<md5 of the parameters> and are
    reused on later calls instead of hitting the network again.

    Returns the decoded JSON response, or "" when the response body is not
    valid JSON (an empty string so callers can still iterate over the result
    or index it and get IndexError).
    """
    cache_dir = "nominatim"
    try:
        os.makedirs(cache_dir)
    except OSError:
        # "Already exists" is the expected case; anything else (permissions,
        # a plain file in the way) must not be silently ignored.
        if not os.path.isdir(cache_dir):
            raise

    # Encoding explicitly keeps the digest identical on Python 2 (json.dumps
    # output is ASCII by default) while not depending on implicit conversion.
    digest = hashlib.md5(json.dumps(address).encode("utf-8")).hexdigest()
    key = os.path.join(cache_dir, digest)

    if os.path.exists(key):
        with open(key) as cached:
            val = cached.read()
    else:
        # The "&" before the encoded parameters is required: without it the
        # first parameter is glued onto the accept-language value and lost.
        url = ("http://open.mapquestapi.com/nominatim/v1/search.php"
               "?format=json&accept-language=en&" + urllib.urlencode(address))
        response = urllib.urlopen(url)
        try:
            val = response.read()
        finally:
            response.close()
        # NOTE: the body is cached even if it later fails to parse, so a bad
        # response keeps yielding "" until its cache file is removed.
        with open(key, "w") as cached:
            cached.write(val)

    try:
        return json.loads(val)
    except ValueError:
        return ""
def nominatim_bbox_transform(bbox):
    """Reorder a four-item bounding box and convert every item to float.

    The result is [bbox[2], bbox[0], bbox[3], bbox[1]].
    """
    # presumably turns Nominatim's "boundingbox" (south, north, west, east)
    # into (west, south, east, north) -- confirm against the API response
    new_order = (2, 0, 3, 1)
    return [float(bbox[position]) for position in new_order]
def geocode_bbox(string, city_bbox, place_params):
    """Geocode a piece of text and keep the first hit that lies in city_bbox.

    string       -- text to look up; stored into place_params["q"]
                    (the caller's dict is modified in place).
    city_bbox    -- sequence indexed as [min lon, min lat, max lon, max lat].
    place_params -- query parameters handed to nominatim_geocode().

    Returns a (lon, lat) tuple of floats for the first accepted place, or
    False when no returned place qualifies. The accepted place is printed.
    """
    place_params["q"] = string

    # Text containing a digit is treated as a house address; for those,
    # results of class "highway" are not accepted.
    is_house = False
    for ch in str(string):
        if ch.isdigit():
            is_house = True
            break

    for candidate in nominatim_geocode(place_params):
        if is_house and candidate["class"] == "highway":
            continue
        if float(candidate["lon"]) < city_bbox[0] or float(candidate["lon"]) > city_bbox[2]:
            continue
        if float(candidate["lat"]) < city_bbox[1] or float(candidate["lat"]) > city_bbox[3]:
            continue
        print(candidate)
        return (float(candidate["lon"]), float(candidate["lat"]))

    return False
# Locate the city itself; its bounding box restricts every later lookup.
# nominatim_geocode() yields "" for an unparsable response, so the [0] below
# raises IndexError when the city cannot be geocoded.
city_geocode = nominatim_geocode(page_address)[0]
city_bbox = nominatim_bbox_transform(city_geocode["boundingbox"])
place_params = {
    "viewbox": ",".join([str(x) for x in city_bbox]),
    "countrycodes": page_address["countrycodes"],
}
texts_to_geocode = set()

# Pass 1: try to geocode every bold text and every address label of the page
# and append a "[map]" link to the elements that could be located.
for element in soup.findAll("b") + soup.findAll("span", "label listing-address"):
    text = element.getText()
    coord = geocode_bbox(text, city_bbox, place_params)
    if not coord:
        print("%s BAD!!!" % text)
        continue
    print("%s %s" % (text, coord))
    lon, lat = coord
    # Links point at openstreetmap.org; a "mapswithme://map?v=1&ll=<lat>,<lon>"
    # deep link is the alternative target for in-app use.
    href_link = "http://www.openstreetmap.org/?mlat=%s&mlon=%s#map=19/%s/%s" % (lat, lon, lat, lon)
    map_tag = soup.new_tag("a", href=href_link)
    map_tag["class"] = "geolink"
    map_tag.string = "[map]"
    element.append(map_tag)

with open('out.html', 'w') as out_html:
    out_html.write(str(soup))

# Pass 2: dump one OSM node per text in texts_to_geocode.
# NOTE: nothing in the code shown fills texts_to_geocode, so this pass
# currently writes only the <osm> wrapper element.
with open("file.osm", "w") as osm:
    osm.write('<osm version="0.6">\n')
    node_id = 0
    bad_cnt = 0
    for text in texts_to_geocode:
        node_id += 1
        place_params["q"] = text
        for place in nominatim_geocode(place_params):
            print(place)
            print(city_bbox)
            if not (float(place["lon"]) < city_bbox[0] or float(place["lon"]) > city_bbox[2]
                    or float(place["lat"]) < city_bbox[1] or float(place["lat"]) > city_bbox[3]):
                # NOTE(review): text is interpolated into the attribute
                # without XML escaping.
                osm.write('<node id="%s" lat="%s" lon="%s" version="1"><tag k="name" v="%s" /></node>\n'
                          % (node_id, place["lat"], place["lon"], text))
                break
        else:
            # No returned place lies inside the city bounding box.
            bad_cnt += 1
            print("%s %s" % (bad_cnt, text))
    osm.write('</osm>\n')

43
geocoder/geocode.py Normal file
View file

@ -0,0 +1,43 @@
import sys
import os
import json
import urllib
# Python 2 idiom: reload(sys) makes sys.setdefaultencoding callable again so
# the process-wide default codec can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")
USAGE = "usage: geocode.py 1|2|3"


def _read_todo(path, separators):
    """Return the rows of the todo file as lists of cleaned fields.

    Every whitespace-separated field has each character sequence in
    *separators* replaced by a space. Rows with fewer than three fields
    (id, city, country) are skipped, e.g. blank lines.
    """
    rows = []
    with open(path) as todo:
        for line in todo:
            fields = line.split()
            if len(fields) < 3:
                continue
            cleaned = []
            for field in fields:
                for separator in separators:
                    field = field.replace(separator, " ")
                cleaned.append(field)
            rows.append(cleaned)
    return rows


def _load_results(path):
    """Return the non-empty result list stored in *path*, else None.

    None covers files that are not valid JSON, empty results, and JSON
    values that are not a list (nothing that could be indexed as results).
    """
    with open(path) as handle:
        try:
            parsed = json.load(handle)
        except ValueError:
            return None
    if isinstance(parsed, list) and parsed:
        return parsed
    return None


def _wget_command(query, out_dir, place_id):
    """Build the wget command line that saves one Nominatim search as JSON."""
    return ("wget 'http://nominatim.openstreetmap.org/search?"
            + urllib.urlencode(query)
            + "' -O " + out_dir + "/" + place_id + ".json")


def main(argv):
    """Dispatch on the mode given as the first command-line argument.

    1 -- print wget commands (structured city/country query) for every row
         of geocodes_todo.txt, saving into dl/.
    2 -- print "id<TAB>lat<TAB>lon" for every file in dl/ holding a result.
    3 -- print wget commands (free-form query) into dl2/ for the rows whose
         dl/ file holds no result.

    Exits with a usage message when the mode is missing or unknown.
    """
    if len(argv) < 2 or argv[1] not in ("1", "2", "3"):
        sys.exit(USAGE)
    mode = argv[1]

    if mode == "1":
        for row in _read_todo("geocodes_todo.txt", ("_",)):
            query = {'city': row[1], 'country': row[2], 'format': 'json'}
            print(_wget_command(query, "dl", row[0]))
    elif mode == "2":
        for name in os.listdir('dl/'):
            results = _load_results(os.path.join('dl', name))
            if results:
                first = results[0]
                print(name.split('.')[0] + '\t' + first["lat"] + '\t' + first["lon"])
    else:
        rows = _read_todo("geocodes_todo.txt", ("_", "/", "(", ")"))
        # Ids are the file names in dl/ up to the first dot.
        bad_ids = set()
        for name in os.listdir('dl/'):
            if not _load_results(os.path.join('dl', name)):
                bad_ids.add(name.split('.')[0])
        for row in rows:
            if row[0] in bad_ids:
                query = {'q': row[1] + " " + row[2], 'format': 'json'}
                print(_wget_command(query, "dl2", row[0]))


if __name__ == "__main__":
    main(sys.argv)