diff --git a/crawler/wikitravel-crawler.sh b/crawler/wikitravel-crawler.sh
index 8a7ee331ed..c5dbeca43e 100755
--- a/crawler/wikitravel-crawler.sh
+++ b/crawler/wikitravel-crawler.sh
@@ -2,31 +2,24 @@
 set -e -u -x
 MY_PATH=`dirname $0`
 
-# $MY_PATH/wikitravel-download-lists.sh
+$MY_PATH/wikitravel-download-lists.sh
 
-# cat wikitravel-redirects-*.html \
-# | $MY_PATH/wikitravel-process-redirects.py \
-# | grep -v Diving_the_Cape_Peninsula \
-# | grep -v '[^\s]*:' \
-# > wikitravel-redirects.json
+cat wikitravel-redirects-*.html \
+ | python $MY_PATH/wikitravel-process-redirects.py \
+ | grep -v '[^\s]*:' \
+ > wikitravel-redirects.json
 
-# cat wikitravel-pages-*.html \
-# | $MY_PATH/wikitravel-process-pages.py \
-# | grep -v Diving_the_Cape_Peninsula \
-# > wikitravel-pages.json
+cat wikitravel-pages-*.html \
+ | python $MY_PATH/wikitravel-process-pages.py \
+ > wikitravel-pages.json
 
-# wc -l wikitravel-pages.json
+echo "Total pages:"
+wc -l wikitravel-pages.json
 
-cat wikitravel-pages.json | $MY_PATH/wikitravel-download-pages.py
+cat wikitravel-pages.json | python $MY_PATH/wikitravel-download-pages.py
 
-cat wikitravel-pages.json | $MY_PATH/wikitravel-process-articles.py
+cat wikitravel-pages.json | python $MY_PATH/wikitravel-process-articles.py
 
-cat wikitravel-pages.json | $MY_PATH/wikitravel-optimize-articles.py
-
-#for file in *.article
-#do
-#java -jar $MY_PATH/htmlcompressor.jar --remove-intertag-spaces --simple-bool-attr --remove-quotes \
-# --remove-js-protocol --type html -o "${file}.opt" "${file}"
-#done
+cat wikitravel-pages.json | python $MY_PATH/wikitravel-optimize-articles.py
 
 # TODO: Run publisher.
diff --git a/crawler/wikitravel-download-lists.sh b/crawler/wikitravel-download-lists.sh
index 16f6c8f764..0ec2f38685 100755
--- a/crawler/wikitravel-download-lists.sh
+++ b/crawler/wikitravel-download-lists.sh
@@ -14,4 +14,5 @@ wget $LONGPAGES_URL"&limit=5000&offset=15000" -O wikitravel-pages-3.html && slee
 wget $REDIRECTS_URL"&limit=5000&offset=0" -O wikitravel-redirects-0.html && sleep 10s
 wget $REDIRECTS_URL"&limit=5000&offset=5000" -O wikitravel-redirects-1.html && sleep 10s
 wget $REDIRECTS_URL"&limit=5000&offset=10000" -O wikitravel-redirects-2.html && sleep 10s
-wget $REDIRECTS_URL"&limit=5000&offset=15000" -O wikitravel-redirects-3.html && sleep 10s
+# last one is empty
+# wget $REDIRECTS_URL"&limit=5000&offset=15000" -O wikitravel-redirects-3.html && sleep 10s