From ad647d0de6262b4a11c112fc31cff8ae82ff69ac Mon Sep 17 00:00:00 2001
From: Yury Melnichek
Date: Fri, 6 Apr 2012 02:19:12 +0200
Subject: [PATCH] [crawler] Wikipedia scraper: ignore meridians and parallels.

---
Review note: the "Ignoring country" message carried {0} {1} placeholders but
no .format() call, so the braces were written to stderr literally. It now
formats the item id and the UTF-8 encoded title of the skipped item.

 crawler/wikipedia-download-pages.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/crawler/wikipedia-download-pages.py b/crawler/wikipedia-download-pages.py
index fafffcb324..e4d869b536 100755
--- a/crawler/wikipedia-download-pages.py
+++ b/crawler/wikipedia-download-pages.py
@@ -19,11 +19,17 @@ ARGS = argParser.parse_args()
 
 for i, line in enumerate(sys.stdin):
   (itemId, lat, lon, itemType, title) = json.loads(line)
-  if lat >= ARGS.minlat and lat <= ARGS.maxlat and lon >= ARGS.minlon and lon <= ARGS.maxlon:
-    fileName = urllib2.quote(title.encode("utf-8"), " ()") + ".html"
-    url = "http://{0}.wikipedia.org/w/index.php?curid={1}&useformat=mobile".format(ARGS.locale, itemId)
+  if lat < ARGS.minlat or lat > ARGS.maxlat or lon < ARGS.minlon or lon > ARGS.maxlon:
+    continue
 
-    if title.find('_') != -1:
-      sys.stderr.write('WARNING! Title contains "_". It will not be found!\n')
+  if itemType == 'country' and (int(lat) == lat or int(lon) == lon):
+    sys.stderr.write('Ignoring country {0} {1} - probably parallel or meridian\n'.format(itemId, title.encode("utf-8")))
+    continue
 
-    scrapelib.ScrapeUrl(url, fileName, 1, i)
+  fileName = urllib2.quote(title.encode("utf-8"), " ()") + ".html"
+  url = "http://{0}.wikipedia.org/w/index.php?curid={1}&useformat=mobile".format(ARGS.locale, itemId)
+
+  if title.find('_') != -1:
+    sys.stderr.write('WARNING! Title contains "_". It will not be found!\n')
+
+  scrapelib.ScrapeUrl(url, fileName, 1, i)