diff --git a/crawler/normalize-image-urls.sh b/crawler/normalize-image-urls.sh
new file mode 100755
index 0000000000..ee045b1df6
--- /dev/null
+++ b/crawler/normalize-image-urls.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Usage: normalize-image-urls.sh <input-url-list> <output-url-list>
+# Drops the "/thumb" path component and the trailing "/<N>px-<name>" segment
+# from each line (presumably yielding the original image URL; verify against
+# the site's URL scheme), then writes the sorted, de-duplicated list.
+set -e -u -x -o pipefail
+
+sed 's:/thumb\(/.*\)/[0-9][0-9]*px-.*$:\1:' "$1" | sort -u > "$2"
diff --git a/crawler/wikitravel-crawler.sh b/crawler/wikitravel-crawler.sh
index 58fd1a2f3f..dee0e843a1 100755
--- a/crawler/wikitravel-crawler.sh
+++ b/crawler/wikitravel-crawler.sh
@@ -28,6 +28,9 @@ cat wikitravel-pages.json | python $MY_PATH/wikitravel-optimize-articles.py
 
 $MY_PATH/extract-image-urls.sh wikitravel-images.urls
 
-wget --wait=1 --no-clobber -i wikitravel-images.urls
+# Strip thumbnail components and de-duplicate the list before downloading.
+$MY_PATH/normalize-image-urls.sh wikitravel-images.urls wikitravel-images-normalized.urls
+
+wget --wait=1 --random-wait --no-clobber -i wikitravel-images-normalized.urls
 
 # TODO: Run publisher.