Wikitravel crawler.

This commit is contained in:
Yury Melnichek 2011-03-15 01:07:34 +01:00 committed by Alex Zolotarev
parent 785c66357f
commit cfa04d35e0
4 changed files with 113 additions and 0 deletions

53
crawler/download.applescript Executable file
View file

@ -0,0 +1,53 @@
-- download.applescript — drive Google Chrome to load a URL and save the page
-- via a toolbar button; invoked per-URL by download.sh.
--
-- argv: item 1 = URL to load
--       item 2 = expected saved-file path (only used by the commented-out
--                wait logic below)
on run argv
-- Load page and wait until it is loaded
tell application "Google Chrome"
activate
set myTab to make new tab at end of tabs of window 1
tell myTab
set URL to item 1 of argv -- "http://www.wikipedia.org"
repeat -- wait completion of loading
set curStat to loading
if curStat = false then exit repeat
delay 0.25
end repeat
end tell
end tell
delay 1
-- Click the save button
-- NOTE(review): "button 5 of tool bar 1" assumes a fixed Chrome toolbar
-- layout (presumably a save-page extension button) — fragile across Chrome
-- versions; confirm the index if this stops working.
repeat 10 times
try
tell application "System Events"
tell process "Google Chrome"
set saveButton to button 5 of tool bar 1 of window 1
click saveButton
exit repeat
end tell
end tell
on error
-- Button not accessible yet (UI still settling); pause and retry.
delay 1
end try
end repeat
-- Wait for the file created
-- repeat while not (exists file (item 2 of argv) of application "Finder")
-- delay 1
-- end repeat
-- Wait for file stopped growing
-- set resFile to (POSIX file (item 2 of argv))
-- set size0 to 0
-- set size1 to size of (info for resFile)
-- repeat while size0 ≠ size1
-- delay 0.25
-- set size0 to size1
-- set size1 to size of (info for resFile)
-- end repeat
-- Fixed grace period used instead of the file-based waits above.
delay 5
tell myTab
delete
end tell
end run

29
crawler/download.sh Executable file
View file

@ -0,0 +1,29 @@
#!/bin/bash
# download.sh — for every URL listed (one per line) in $1, drive Chrome via
# download.applescript to save the page, then move the saved file from
# Chrome's extension storage into the output directory.
#
# Usage: download.sh <url-list-file> <saved-file-name> <output-dir>
set -e -u -x

# Absolute directory of this script (stat -f %N is the macOS/BSD form).
MY_PATH=$(dirname "$(stat -f %N "$PWD/$0")")
SAVED_PATH="${HOME}/Library/Application Support/Google/Chrome/Default/FileSystem/chrome-extension_jemlklgaibiijojffihnhieihhagocma_0/Persistent/chrome-oWHMA7fJwDx8JDjs"
SAVED_FILE="${SAVED_PATH}/${2}"

# Remove any stale result left over from a previous run.
rm -f -- "${SAVED_FILE}"

# Read URLs line by line (quoted throughout — paths contain spaces).
while IFS= read -r url; do
  [ -n "$url" ] || continue
  if ! osascript "$MY_PATH/download.applescript" "$url" "${SAVED_FILE}"
  then
    echo "applescript failed" >&2
    sleep 10
    # One retry after a cool-down; set -e aborts if this one fails too.
    osascript "$MY_PATH/download.applescript" "$url" "${SAVED_FILE}"
  fi
  # Give Chrome extra time if the file has not appeared yet.
  if [ ! -f "${SAVED_FILE}" ]
  then
    sleep 5
  fi
  if [ ! -f "${SAVED_FILE}" ]
  then
    echo "file not found" >&2
    exit 1
  fi
  # Name the result after the last path component of the URL.
  mv -- "${SAVED_FILE}" "$3/${url##*/}.html"
done < "$1"

13
crawler/wikitravel-download.sh Executable file
View file

@ -0,0 +1,13 @@
#!/bin/bash
# wikitravel-download.sh — extract article URLs from the downloaded
# Special:Longpages dumps (wikitravel-pages-*.html) into wikitravel-urls.txt,
# skipping phrasebooks and one known-bad page.
set -e -u -x

# Absolute directory of this script (stat -f %N is the macOS/BSD form).
# Only needed by the commented-out download step at the bottom.
MY_PATH=$(dirname "$(stat -f %N "$PWD/$0")")

# cat is kept deliberately: grep -o over multiple files would prefix each
# match with its filename. egrep is deprecated; grep -E is the modern form.
cat wikitravel-pages-*.html \
  | grep -E -o '<a href=\"/en/.+?bytes]</li>' \
  | sed "s@<a href=\"@http://m.wikitravel.org@" \
  | sed "s@\" title=.*</a>.*bytes]</li>@@" \
  | grep -v phrasebook \
  | grep -v "Diving_the_Cape_Peninsula" \
  > wikitravel-urls.txt
# $MY_PATH/download.sh wikitravel-urls.txt "WikiTravel Mobile.html" ./

18
crawler/wikitravel-get-lists.sh Executable file
View file

@ -0,0 +1,18 @@
#!/bin/bash
# wikitravel-get-lists.sh — fetch the Special:Longpages and
# Special:Listredirects listings from wikitravel.org in 5000-entry chunks.
set -e -u -x

LONGPAGES_URL="http://wikitravel.org/wiki/en/index.php?title=Special:Longpages"
REDIRECTS_URL="http://wikitravel.org/wiki/en/index.php?title=Special:Listredirects"

# fetch_all <base-url> <output-prefix>
# Downloads four 5000-entry listing pages (offsets 0, 5000, 10000, 15000)
# as <output-prefix>-<n>.html, sleeping 10s between requests to be polite
# to the server.
fetch_all() {
  local base=$1
  local prefix=$2
  local i
  for i in 0 1 2 3; do
    wget "${base}&limit=5000&offset=$((i * 5000))" -O "${prefix}-${i}.html"
    sleep 10
  done
}

# Get all pages.
fetch_all "$LONGPAGES_URL" wikitravel-pages
# Get all redirects.
fetch_all "$REDIRECTS_URL" wikitravel-redirects