forked from organicmaps/organicmaps
Wikitravel crawler.
This commit is contained in:
parent
785c66357f
commit
cfa04d35e0
4 changed files with 113 additions and 0 deletions
53
crawler/download.applescript
Executable file
53
crawler/download.applescript
Executable file
|
@ -0,0 +1,53 @@
|
|||
-- Download one web page by driving Google Chrome from AppleScript.
-- argv: item 1 = URL to load; item 2 = expected saved-file path (only
-- used by the wait-for-file logic, which is currently disabled below).
on run argv
	-- Open the URL in a fresh tab and block until Chrome finishes loading.
	tell application "Google Chrome"
		activate
		set pageTab to make new tab at end of tabs of window 1
		tell pageTab
			set URL to item 1 of argv -- e.g. "http://www.wikipedia.org"
			repeat while loading
				delay 0.25
			end repeat
		end tell
	end tell

	delay 1

	-- Click the save button (button 5 of Chrome's toolbar). The button may
	-- not be clickable immediately, so retry up to 10 times, pausing one
	-- second after each failed attempt.
	repeat 10 times
		try
			tell application "System Events"
				tell process "Google Chrome"
					click (button 5 of tool bar 1 of window 1)
					exit repeat
				end tell
			end tell
		on error
			delay 1
		end try
	end repeat

	-- Wait for the file to be created (disabled):
	-- repeat while not (exists file (item 2 of argv) of application "Finder")
	--   delay 1
	-- end repeat

	-- Wait until the file stops growing (disabled):
	-- set resFile to (POSIX file (item 2 of argv))
	-- set size0 to 0
	-- set size1 to size of (info for resFile)
	-- repeat while size0 size1
	--   delay 0.25
	--   set size0 to size1
	--   set size1 to size of (info for resFile)
	-- end repeat

	delay 5

	-- Close the tab we opened.
	tell application "Google Chrome" to delete pageTab
end run
|
29
crawler/download.sh
Executable file
29
crawler/download.sh
Executable file
|
@ -0,0 +1,29 @@
|
|||
#!/bin/bash
# Download every URL listed in a file by driving Google Chrome through
# download.applescript, then collect each saved page from the Chrome
# extension's sandbox directory into an output directory.
#
# Usage: download.sh <url-list-file> <saved-file-name> <output-dir>
#   $1 - file with one URL per line
#   $2 - file name the Chrome save-extension writes under SAVED_PATH
#   $3 - directory that receives the renamed .html results
set -e -u -x

# Resolve the directory containing this script (BSD stat -f %N).
MY_PATH=$(dirname "$(stat -f %N "$PWD/$0")")

# Fixed location where the Chrome "save page" extension writes its output.
SAVED_PATH="${HOME}/Library/Application Support/Google/Chrome/Default/FileSystem/chrome-extension_jemlklgaibiijojffihnhieihhagocma_0/Persistent/chrome-oWHMA7fJwDx8JDjs"
SAVED_FILE="${SAVED_PATH}/${2}"

# Remove any stale result from a previous run; ignore "file not found".
rm -f -- "${SAVED_FILE}"

while IFS= read -r url; do
  [ -n "$url" ] || continue

  # First attempt; on failure wait and retry once (second failure aborts
  # the script via set -e).
  if ! osascript "$MY_PATH/download.applescript" "$url" "${SAVED_FILE}"
  then
    echo "applescript failed" >&2
    sleep 10
    osascript "$MY_PATH/download.applescript" "$url" "${SAVED_FILE}"
  fi

  # Give Chrome extra time if the saved file has not appeared yet.
  if [ ! -f "${SAVED_FILE}" ]
  then
    sleep 5
  fi

  if [ ! -f "${SAVED_FILE}" ]
  then
    echo "file not found" >&2
    exit 1
  fi

  # Name the result after the last path component of the URL.
  mv -- "${SAVED_FILE}" "$3/${url##*/}.html"
done < "$1"
|
13
crawler/wikitravel-download.sh
Executable file
13
crawler/wikitravel-download.sh
Executable file
|
@ -0,0 +1,13 @@
|
|||
#!/bin/bash
# Build wikitravel-urls.txt: extract article links from the downloaded
# Special:Longpages listing files (wikitravel-pages-*.html), turn them
# into absolute mobile-site URLs, and drop unwanted pages.
set -e -u -x

# Resolve the directory containing this script (BSD stat -f %N).
MY_PATH=$(dirname "$(stat -f %N "$PWD/$0")")

# Match each "<a href=\"/en/..." entry up to its "... bytes]</li>" tail,
# rewrite the href into an absolute m.wikitravel.org URL, strip the tail,
# then filter out phrasebooks and one known-bad article.
cat wikitravel-pages-*.html \
  | grep -E -o '<a href=\"/en/.+?bytes]</li>' \
  | sed "s@<a href=\"@http://m.wikitravel.org@" \
  | sed "s@\" title=.*</a>.*bytes]</li>@@" \
  | grep -v phrasebook \
  | grep -v "Diving_the_Cape_Peninsula" \
  > wikitravel-urls.txt

# $MY_PATH/download.sh wikitravel-urls.txt "WikiTravel Mobile.html" ./
|
18
crawler/wikitravel-get-lists.sh
Executable file
18
crawler/wikitravel-get-lists.sh
Executable file
|
@ -0,0 +1,18 @@
|
|||
#!/bin/bash
# Fetch the Special:Longpages and Special:Listredirects listings from
# Wikitravel in 5000-entry chunks, pausing between requests to be polite
# to the server.
set -e -u -x

# Resolve the directory containing this script (BSD stat -f %N).
MY_PATH=$(dirname "$(stat -f %N "$PWD/$0")")

LONGPAGES_URL="http://wikitravel.org/wiki/en/index.php?title=Special:Longpages"
REDIRECTS_URL="http://wikitravel.org/wiki/en/index.php?title=Special:Listredirects"

# Get all pages (4 chunks of 5000 entries each).
for offset in 0 5000 10000 15000; do
  wget "${LONGPAGES_URL}&limit=5000&offset=${offset}" \
    -O "wikitravel-pages-$((offset / 5000)).html"
  sleep 10
done

# Get all redirects (4 chunks of 5000 entries each).
for offset in 0 5000 10000 15000; do
  wget "${REDIRECTS_URL}&limit=5000&offset=${offset}" \
    -O "wikitravel-redirects-$((offset / 5000)).html"
  sleep 10
done
|
Loading…
Add table
Reference in a new issue