forked from organicmaps/organicmaps
Updated crawler scripts
This commit is contained in:
parent
e5a18b9823
commit
a779e98302
2 changed files with 15 additions and 21 deletions
|
@@ -2,31 +2,24 @@
|
|||
set -e -u -x
|
||||
MY_PATH=`dirname $0`
|
||||
|
||||
# $MY_PATH/wikitravel-download-lists.sh
|
||||
$MY_PATH/wikitravel-download-lists.sh
|
||||
|
||||
# cat wikitravel-redirects-*.html \
|
||||
# | $MY_PATH/wikitravel-process-redirects.py \
|
||||
# | grep -v Diving_the_Cape_Peninsula \
|
||||
# | grep -v '[^\s]*:' \
|
||||
# > wikitravel-redirects.json
|
||||
cat wikitravel-redirects-*.html \
|
||||
| python $MY_PATH/wikitravel-process-redirects.py \
|
||||
| grep -v '[^\s]*:' \
|
||||
> wikitravel-redirects.json
|
||||
|
||||
# cat wikitravel-pages-*.html \
|
||||
# | $MY_PATH/wikitravel-process-pages.py \
|
||||
# | grep -v Diving_the_Cape_Peninsula \
|
||||
# > wikitravel-pages.json
|
||||
cat wikitravel-pages-*.html \
|
||||
| python $MY_PATH/wikitravel-process-pages.py \
|
||||
> wikitravel-pages.json
|
||||
|
||||
# wc -l wikitravel-pages.json
|
||||
echo "Total pages:"
|
||||
wc -l wikitravel-pages.json
|
||||
|
||||
cat wikitravel-pages.json | $MY_PATH/wikitravel-download-pages.py
|
||||
cat wikitravel-pages.json | python $MY_PATH/wikitravel-download-pages.py
|
||||
|
||||
cat wikitravel-pages.json | $MY_PATH/wikitravel-process-articles.py
|
||||
cat wikitravel-pages.json | python $MY_PATH/wikitravel-process-articles.py
|
||||
|
||||
cat wikitravel-pages.json | $MY_PATH/wikitravel-optimize-articles.py
|
||||
|
||||
#for file in *.article
|
||||
#do
|
||||
#java -jar $MY_PATH/htmlcompressor.jar --remove-intertag-spaces --simple-bool-attr --remove-quotes \
|
||||
# --remove-js-protocol --type html -o "${file}.opt" "${file}"
|
||||
#done
|
||||
cat wikitravel-pages.json | python $MY_PATH/wikitravel-optimize-articles.py
|
||||
|
||||
# TODO: Run publisher.
|
||||
|
|
|
@@ -14,4 +14,5 @@ wget $LONGPAGES_URL"&limit=5000&offset=15000" -O wikitravel-pages-3.html && slee
|
|||
wget $REDIRECTS_URL"&limit=5000&offset=0" -O wikitravel-redirects-0.html && sleep 10s
|
||||
wget $REDIRECTS_URL"&limit=5000&offset=5000" -O wikitravel-redirects-1.html && sleep 10s
|
||||
wget $REDIRECTS_URL"&limit=5000&offset=10000" -O wikitravel-redirects-2.html && sleep 10s
|
||||
wget $REDIRECTS_URL"&limit=5000&offset=15000" -O wikitravel-redirects-3.html && sleep 10s
|
||||
# last one is empty
|
||||
# wget $REDIRECTS_URL"&limit=5000&offset=15000" -O wikitravel-redirects-3.html && sleep 10s
|
||||
|
|
Loading…
Add table
Reference in a new issue