From f8b8a13a870024f5a20e0efdfa9c12239fa1cad9 Mon Sep 17 00:00:00 2001
From: Yury Melnichek
Date: Mon, 17 Sep 2012 12:13:25 +0200
Subject: [PATCH] [crawler] Download full wikitravel images, not thumbnails.

---
Review notes (this commentary section is not part of the commit message):

* normalize-image-urls.sh takes two arguments, <input-file> <output-file>.
  Its sed expression rewrites ".../thumb/<path>/<N>px-<name>" to
  ".../<path>", and the result is sorted and de-duplicated with `sort -u`.
* The normalizer's output file in wikitravel-crawler.sh was spelled
  "wikitravel-images-normalized.url" while wget read
  "wikitravel-images-normalized.urls"; both now use the ".urls" name.
* "$1" and "$2" are quoted so that paths containing whitespace or glob
  characters work, and sed reads the input file directly instead of via cat.
* The blob ids on the "index" lines are those of the original commit and no
  longer match the edited contents.

 crawler/normalize-image-urls.sh | 4 ++++
 crawler/wikitravel-crawler.sh   | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)
 create mode 100755 crawler/normalize-image-urls.sh

diff --git a/crawler/normalize-image-urls.sh b/crawler/normalize-image-urls.sh
new file mode 100755
index 0000000000..ee045b1df6
--- /dev/null
+++ b/crawler/normalize-image-urls.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+set -e -u -x
+
+sed 's:/thumb\(/.*\)/[0-9][0-9]*px-.*$:\1:' "$1" | sort -u > "$2"
diff --git a/crawler/wikitravel-crawler.sh b/crawler/wikitravel-crawler.sh
index 58fd1a2f3f..dee0e843a1 100755
--- a/crawler/wikitravel-crawler.sh
+++ b/crawler/wikitravel-crawler.sh
@@ -28,6 +28,8 @@ cat wikitravel-pages.json | python $MY_PATH/wikitravel-optimize-articles.py
 
 $MY_PATH/extract-image-urls.sh wikitravel-images.urls
 
-wget --wait=1 --no-clobber -i wikitravel-images.urls
+$MY_PATH/normalize-image-urls.sh wikitravel-images.urls wikitravel-images-normalized.urls
+
+wget --wait=1 --random-wait --no-clobber -i wikitravel-images-normalized.urls
 
 # TODO: Run publisher.