Updated crawler scripts

This commit is contained in:
Alex Zolotarev 2011-09-14 20:59:39 +02:00 committed by Alex Zolotarev
parent e5a18b9823
commit a779e98302
2 changed files with 15 additions and 21 deletions

View file

@ -2,31 +2,24 @@
# NOTE(review): this span reads like a rendered diff hunk whose removed and
# added lines are shown together without +/- markers, which would explain why
# most steps appear twice (an older form, then a form invoked via `python`).
# Confirm which lines are meant to be live before running it as a script.

# Abort on the first failing command (-e), treat unset variables as errors
# (-u), and trace every command as it is executed (-x).
set -e -u -x
# Directory part of the path this script was invoked with; the helper scripts
# below are addressed relative to it.
MY_PATH=`dirname $0`
# $MY_PATH/wikitravel-download-lists.sh
# Run the list downloader. Presumably it produces the wikitravel-*-N.html
# files consumed below -- TODO confirm against that script.
$MY_PATH/wikitravel-download-lists.sh
# cat wikitravel-redirects-*.html \
# | $MY_PATH/wikitravel-process-redirects.py \
# | grep -v Diving_the_Cape_Peninsula \
# | grep -v '[^\s]*:' \
# > wikitravel-redirects.json
# Concatenate every wikitravel-redirects-*.html file, pipe it through
# wikitravel-process-redirects.py, drop the lines matching the grep pattern,
# and write what remains to wikitravel-redirects.json.
# NOTE(review): inside a bracket expression `\s` is not a whitespace class in
# POSIX regular expressions, so '[^\s]*:' probably matches any line that
# contains ':' -- confirm that this is the intended filter.
cat wikitravel-redirects-*.html \
| python $MY_PATH/wikitravel-process-redirects.py \
| grep -v '[^\s]*:' \
> wikitravel-redirects.json
# cat wikitravel-pages-*.html \
# | $MY_PATH/wikitravel-process-pages.py \
# | grep -v Diving_the_Cape_Peninsula \
# > wikitravel-pages.json
# Concatenate every wikitravel-pages-*.html file, pipe it through
# wikitravel-process-pages.py, and write the result to wikitravel-pages.json.
cat wikitravel-pages-*.html \
| python $MY_PATH/wikitravel-process-pages.py \
> wikitravel-pages.json
# wc -l wikitravel-pages.json
# Print a label followed by the line count of wikitravel-pages.json.
echo "Total pages:"
wc -l wikitravel-pages.json
# Feed wikitravel-pages.json on stdin to the download, process and optimize
# scripts, in that order. Each step is listed twice: once executed directly
# and once through the `python` interpreter (see the review note at the top).
cat wikitravel-pages.json | $MY_PATH/wikitravel-download-pages.py
cat wikitravel-pages.json | python $MY_PATH/wikitravel-download-pages.py
cat wikitravel-pages.json | $MY_PATH/wikitravel-process-articles.py
cat wikitravel-pages.json | python $MY_PATH/wikitravel-process-articles.py
cat wikitravel-pages.json | $MY_PATH/wikitravel-optimize-articles.py
# Disabled alternative: run htmlcompressor.jar over every *.article file,
# writing the output next to it as "<file>.opt".
#for file in *.article
#do
#java -jar $MY_PATH/htmlcompressor.jar --remove-intertag-spaces --simple-bool-attr --remove-quotes \
# --remove-js-protocol --type html -o "${file}.opt" "${file}"
#done
cat wikitravel-pages.json | python $MY_PATH/wikitravel-optimize-articles.py
# TODO: Run publisher.

View file

@ -14,4 +14,5 @@ wget $LONGPAGES_URL"&limit=5000&offset=15000" -O wikitravel-pages-3.html && slee
# NOTE(review): these lines look like a rendered diff hunk without +/- markers:
# the offset=15000 request appears both live and commented out below. Confirm
# which form is current. REDIRECTS_URL is not assigned in the lines visible
# here; presumably it is set earlier in the script.
# Fetch the redirect listing in pages of 5000 entries (offset 0, 5000, ...),
# saving each page to its own numbered file and sleeping 10 seconds after each
# successful download.
wget $REDIRECTS_URL"&limit=5000&offset=0" -O wikitravel-redirects-0.html && sleep 10s
wget $REDIRECTS_URL"&limit=5000&offset=5000" -O wikitravel-redirects-1.html && sleep 10s
wget $REDIRECTS_URL"&limit=5000&offset=10000" -O wikitravel-redirects-2.html && sleep 10s
wget $REDIRECTS_URL"&limit=5000&offset=15000" -O wikitravel-redirects-3.html && sleep 10s
# last one is empty, so the offset=15000 request is kept only as a comment
# wget $REDIRECTS_URL"&limit=5000&offset=15000" -O wikitravel-redirects-3.html && sleep 10s