forked from organicmaps/organicmaps
Add scrapelib.py and Wikipedia scraper.
parent 0340de209d
commit 1f52586a6a
3 changed files with 84 additions and 0 deletions
crawler/scrapelib.py (new executable file, 38 lines)
@@ -0,0 +1,38 @@
#!/opt/local/bin/python

import os.path
import sys
import time
import urllib2

USER_AGENT="MapsWithMe Scraper <info@mapswithme.com>"

def ScrapeUrl(url, fileName, delay, info, maxTryCount = 5, userAgent = USER_AGENT):
    if os.path.exists(fileName):
        sys.stderr.write('Skipping existing {0} {1}\n'.format(info, fileName))
    else:
        sys.stderr.write('Downloading {0} {1} {2}\n'.format(info, fileName, url))

        tryCount = 0
        while True:
            try:
                tryCount = tryCount + 1
                remoteFile = urllib2.urlopen(urllib2.Request(url, None, { "User-Agent" : userAgent }))

                try:
                    data = remoteFile.read()
                finally:
                    remoteFile.close()
                break
            except IOError as error:
                sys.stderr.write('Try {0}, error: {1}\n'.format(tryCount, error))
                if tryCount >= maxTryCount:
                    raise
                else:
                    time.sleep(delay)

        localFile = open(fileName, 'w')
        localFile.write(data)
        localFile.close()

        time.sleep(delay)
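ScrapeUrl downloads a URL into fileName, skipping files that already exist and retrying failed requests up to maxTryCount times, sleeping delay seconds between attempts. A minimal usage sketch, assuming the module is importable; the URL, file name, and info tag below are hypothetical, not part of this commit:

    import scrapelib

    # Fetch one page with a 1-second delay; the last argument is only a tag
    # echoed in the progress messages written to stderr.
    scrapelib.ScrapeUrl("http://en.wikipedia.org/wiki/Example", "Example.html", 1, 0)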
crawler/wikipedia-crawler.sh (new executable file, 17 lines)
@@ -0,0 +1,17 @@
#!/bin/bash
set -e -u -x
MY_PATH=`dirname $0`
WIKI_LOCALE="en"
NOW=$(date +"%Y-%m-%d-%H-%M")

wget http://wikilocation.org/mysqldumps/$WIKI_LOCALE.sql.gz

gunzip $WIKI_LOCALE.sql.gz

cat $WIKI_LOCALE.sql | grep "INSERT INTO" > $WIKI_LOCALE.inserts

cat $WIKI_LOCALE.inserts | sed 's/INSERT INTO .* VALUES (/[/g' | sed 's/),(/]\n[/g' | sed 's/);/]/g' | sed "s/','/\",\"/g" | sed "s/,'/,\"/g" | sed "s/']/\"]/g" | sed "s/\\\'/\\'/g" > $WIKI_LOCALE-pages.json

cat $WIKI_LOCALE-pages.json | python $MY_PATH/wikipedia-download-pages.py --locale=$WIKI_LOCALE --minlat=45.8 --maxlat=47.83 --minlon=5.93 --maxlon=10.54 2>&1 | tee wikipedia-download.log.$NOW

# TODO: Run publisher.
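The sed pipeline rewrites each row of the wikilocation.org SQL dump into one JSON array per line, which is the format wikipedia-download-pages.py reads from stdin: [itemId, lat, lon, itemType, title]. A small sketch of how one such line is parsed; the values are illustrative, not taken from a real dump:

    import json

    line = '[12345, 46.2044, 6.1432, "city", "Geneva"]'  # hypothetical converted row
    (itemId, lat, lon, itemType, title) = json.loads(line)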
crawler/wikipedia-download-pages.py (new executable file, 29 lines)
@@ -0,0 +1,29 @@
#!/opt/local/bin/python
import scrapelib

import argparse
import json
import os.path
import sys
import time
import urllib2

argParser = argparse.ArgumentParser(description = 'Download Wikipedia for a given locale.')
argParser.add_argument("--locale", required = True)
argParser.add_argument("--minlat", type=float, default=-1000)
argParser.add_argument("--maxlat", type=float, default=1000)
argParser.add_argument("--minlon", type=float, default=-1000)
argParser.add_argument("--maxlon", type=float, default=1000)
ARGS = argParser.parse_args()

for i, line in enumerate(sys.stdin):
    (itemId, lat, lon, itemType, title) = json.loads(line)

    if lat >= ARGS.minlat and lat <= ARGS.maxlat and lon >= ARGS.minlon and lon <= ARGS.maxlon:
        fileName = urllib2.quote(title.encode("utf-8"), " ()") + ".html"
        url = "http://{0}.wikipedia.org/w/index.php?curid={1}&useformat=mobile".format(ARGS.locale, itemId)

        if title.find('_') != -1:
            sys.stderr.write('WARNING! Title contains "_". It will not be found!\n')

        scrapelib.ScrapeUrl(url, fileName, 1, i)
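The local file name is derived from the article title with urllib2.quote, leaving spaces and parentheses unescaped while percent-encoding everything else. A sketch of the resulting name for a hypothetical title (not one from the dump):

    import urllib2

    title = u"Z\u00fcrich (canton)"  # hypothetical article title
    fileName = urllib2.quote(title.encode("utf-8"), " ()") + ".html"
    # fileName == "Z%C3%BCrich (canton).html"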