diff --git a/crawler/scrapelib.py b/crawler/scrapelib.py
new file mode 100755
index 0000000000..05885b6236
--- /dev/null
+++ b/crawler/scrapelib.py
@@ -0,0 +1,38 @@
+#!/opt/local/bin/python
+
+import os.path
+import sys
+import time
+import urllib2
+
+USER_AGENT="MapsWithMe Scraper "
+
+def ScrapeUrl(url, fileName, delay, info, maxTryCount = 5, userAgent = USER_AGENT):
+    if os.path.exists(fileName):
+        sys.stderr.write('Skipping existing {0} {1}\n'.format(info, fileName))
+    else:
+        sys.stderr.write('Downloading {0} {1} {2} \n'.format(info, fileName, url))
+
+        tryCount = 0
+        while True:
+            try:
+                tryCount = tryCount + 1
+                remoteFile = urllib2.urlopen(urllib2.Request(url, None, { "User-Agent" : userAgent }))
+
+                try:
+                    data = remoteFile.read()
+                finally:
+                    remoteFile.close()
+                break
+            except IOError as error:
+                sys.stderr.write('Try {0}, error: {1}\n'.format(tryCount, error))
+                if tryCount >= maxTryCount:
+                    raise
+                else:
+                    time.sleep(delay)
+
+        localFile = open(fileName, 'w')
+        localFile.write(data)
+        localFile.close()
+
+        time.sleep(delay)
diff --git a/crawler/wikipedia-crawler.sh b/crawler/wikipedia-crawler.sh
new file mode 100755
index 0000000000..2f2e835950
--- /dev/null
+++ b/crawler/wikipedia-crawler.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+set -e -u -x
+MY_PATH=`dirname $0`
+WIKI_LOCALE="en"
+NOW=$(date +"%Y-%m-%d-%H-%M")
+
+wget http://wikilocation.org/mysqldumps/$WIKI_LOCALE.sql.gz
+
+gunzip $WIKI_LOCALE.sql.gz
+
+cat $WIKI_LOCALE.sql | grep "INSERT INTO" > $WIKI_LOCALE.inserts
+
+cat $WIKI_LOCALE.inserts | sed 's/INSERT INTO .* VALUES (/[/g' | sed 's/),(/]\n[/g' | sed 's/);/]/g' | sed "s/','/\",\"/g" | sed "s/,'/,\"/g" | sed "s/']/\"]/g" | sed "s/\\\'/\\'/g" > $WIKI_LOCALE-pages.json
+
+cat $WIKI_LOCALE-pages.json | python $MY_PATH/wikipedia-download-pages.py --locale=$WIKI_LOCALE --minlat=45.8 --maxlat=47.83 --minlon=5.93 --maxlon=10.54 2>&1 | tee wikipedia-download.log.$NOW
+
+# TODO: Run publisher.
diff --git a/crawler/wikipedia-download-pages.py b/crawler/wikipedia-download-pages.py
new file mode 100755
index 0000000000..fafffcb324
--- /dev/null
+++ b/crawler/wikipedia-download-pages.py
@@ -0,0 +1,29 @@
+#!/opt/local/bin/python
+import scrapelib
+
+import argparse
+import json
+import os.path
+import sys
+import time
+import urllib2
+
+argParser = argparse.ArgumentParser(description = 'Download Wikipedia for a given locale.')
+argParser.add_argument("--locale", required = True)
+argParser.add_argument("--minlat", type=float, default=-1000)
+argParser.add_argument("--maxlat", type=float, default=1000)
+argParser.add_argument("--minlon", type=float, default=-1000)
+argParser.add_argument("--maxlon", type=float, default=1000)
+ARGS = argParser.parse_args()
+
+for i, line in enumerate(sys.stdin):
+    (itemId, lat, lon, itemType, title) = json.loads(line)
+
+    if lat >= ARGS.minlat and lat <= ARGS.maxlat and lon >= ARGS.minlon and lon <= ARGS.maxlon:
+        fileName = urllib2.quote(title.encode("utf-8"), " ()") + ".html"
+        url = "http://{0}.wikipedia.org/w/index.php?curid={1}&useformat=mobile".format(ARGS.locale, itemId)
+
+        if title.find('_') != -1:
+            sys.stderr.write('WARNING! Title contains "_". It will not be found!\n')
+
+        scrapelib.ScrapeUrl(url, fileName, 1, i)
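
Note (not part of the patch): a minimal sketch of the data flow the pipeline assumes. The sed chain in wikipedia-crawler.sh is expected to turn each row of the wikilocation dump into one JSON array per line, [id, lat, lon, type, title], which wikipedia-download-pages.py then reads from stdin and filters against the bounding box. The sample values below are made up for illustration; only the format is taken from the scripts.

    import json

    # One hypothetical line from en-pages.json, as produced by the sed conversion.
    sample = '[12345, 46.0207, 7.7491, "mountain", "Matterhorn"]'
    itemId, lat, lon, itemType, title = json.loads(sample)

    # The same bounding-box check the downloader applies with the flags
    # passed in wikipedia-crawler.sh.
    assert 45.8 <= lat <= 47.83
    assert 5.93 <= lon <= 10.54

    # A row that passes the check is fetched via scrapelib.ScrapeUrl with a
    # 1-second delay between requests, using a URL of this form (Python 2 print).
    print "http://en.wikipedia.org/w/index.php?curid={0}&useformat=mobile".format(itemId)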