forked from organicmaps/organicmaps
New Wikitravel crawler.
commit 571ca3397f
parent 4f1cc054ea
8 changed files with 69 additions and 95 deletions
Deleted file:
@@ -1,53 +0,0 @@
on run argv
    -- Load page and wait until it is loaded
    tell application "Google Chrome"
        activate
        set myTab to make new tab at end of tabs of window 1
        tell myTab
            set URL to item 1 of argv -- "http://www.wikipedia.org"
            repeat -- wait completion of loading
                set curStat to loading
                if curStat = false then exit repeat
                delay 0.25
            end repeat
        end tell
    end tell

    delay 1

    -- Click the save button
    repeat 10 times
        try
            tell application "System Events"
                tell process "Google Chrome"
                    set saveButton to button 5 of tool bar 1 of window 1
                    click saveButton
                    exit repeat
                end tell
            end tell
        on error
            delay 1
        end try
    end repeat

    -- Wait for the file created
    -- repeat while not (exists file (item 2 of argv) of application "Finder")
    --     delay 1
    -- end repeat

    -- Wait for file stopped growing
    -- set resFile to (POSIX file (item 2 of argv))
    -- set size0 to 0
    -- set size1 to size of (info for resFile)
    -- repeat while size0 ≠ size1
    --     delay 0.25
    --     set size0 to size1
    --     set size1 to size of (info for resFile)
    -- end repeat

    delay 5

    tell myTab
        delete
    end tell
end run
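The commented-out block above polls the saved file's size until two consecutive reads match, i.e. until the download has stopped growing. A minimal Python sketch of the same polling idea, shown only as an illustration (the wait_until_stable helper and its timings are assumptions, not part of this commit):

import os
import time

def wait_until_stable(path, interval=0.25):
    # Wait for the file to appear first.
    while not os.path.exists(path):
        time.sleep(1)
    # Then poll its size until it stops changing.
    previous = -1
    current = os.path.getsize(path)
    while previous != current:
        time.sleep(interval)
        previous = current
        current = os.path.getsize(path)
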
Deleted file:
@@ -1,29 +0,0 @@
#!/bin/bash
set -e -u -x
MY_PATH=`dirname $(stat -f %N $PWD"/"$0)`
SAVED_PATH="${HOME}/Library/Application Support/Google/Chrome/Default/FileSystem/chrome-extension_jemlklgaibiijojffihnhieihhagocma_0/Persistent/chrome-oWHMA7fJwDx8JDjs"
SAVED_FILE="${SAVED_PATH}/${2}"

rm "$SAVED_FILE" || true

for i in $(cat $1) ; do
  if ! osascript "$MY_PATH/download.applescript" "$i" "${SAVED_FILE}"
  then
    echo "applescript failed";
    sleep 10s
    osascript "$MY_PATH/download.applescript" "$i" "${SAVED_FILE}"
  fi

  if [ ! -f "${SAVED_FILE}" ]
  then
    sleep 5s
  fi

  if [ ! -f "${SAVED_FILE}" ]
  then
    echo "file not found"
    exit 1
  fi

  mv "${SAVED_FILE}" $3/${i##*/}".html"
done
crawler/wikitravel-crawler.sh (new executable file, 24 additions)
@@ -0,0 +1,24 @@
#!/bin/bash
set -e -u -x
MY_PATH=`dirname $0`

$MY_PATH/wikitravel-download-lists.sh

cat wikitravel-redirects-*.html \
  | $MY_PATH/wikitravel-process-redirects.py \
  | grep -v Diving_the_Cape_Peninsula \
  | grep -v '[^\s]*:' \
  > wikitravel-redirects.json

cat wikitravel-pages-*.html \
  | $MY_PATH/wikitravel-process-pages.py \
  | grep -v Diving_the_Cape_Peninsula \
  > wikitravel-pages.json

wc -l wikitravel-pages.json

cat wikitravel-pages.json | $MY_PATH/wikitravel-download-pages.py

# TODO: Strip articles

# TODO: Run publisher.
crawler/wikitravel-download-pages.py (new executable file, 23 additions)
@@ -0,0 +1,23 @@
#!/usr/bin/python
import json
import os.path
import sys
import time
import urllib2

for i, line in enumerate(sys.stdin):
    (url, title, fileName) = json.loads(line)
    if os.path.exists(fileName):
        sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName))
    else:
        sys.stderr.write('Downloading {0} {1}\n'.format(i, fileName))

        remoteFile = urllib2.urlopen(url)
        data = remoteFile.read()
        remoteFile.close()

        localFile = open(fileName, 'w')
        localFile.write(data)
        localFile.close()

        time.sleep(1)
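The script above is Python 2 (urllib2 and the print-less stderr writes). Purely as a sketch, the same loop on Python 3 with urllib.request standing in for urllib2; the input format of one JSON array (url, title, fileName) per line is taken from the script above, everything else is an assumption:

import json
import os.path
import sys
import time
import urllib.request

for i, line in enumerate(sys.stdin):
    url, title, fileName = json.loads(line)
    if os.path.exists(fileName):
        sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName))
        continue
    sys.stderr.write('Downloading {0} {1}\n'.format(i, fileName))
    # Fetch the page and write it as raw bytes.
    with urllib.request.urlopen(url) as remoteFile:
        data = remoteFile.read()
    with open(fileName, 'wb') as localFile:
        localFile.write(data)
    time.sleep(1)  # stay polite to the server, as in the original
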
Deleted file:
@@ -1,13 +0,0 @@
#!/bin/bash
set -e -u -x
MY_PATH=`dirname $(stat -f %N $PWD"/"$0)`

cat wikitravel-pages-*.html \
  | egrep '<a href=\"/en/.+?bytes]</li>' -o \
  | sed "s@<a href=\"@http://m.wikitravel.org@" \
  | sed "s@\" title=.*</a>.*bytes]</li>@@" \
  | grep -v phrasebook \
  | grep -v "Diving_the_Cape_Peninsula" \
  > wikitravel-urls.txt

# $MY_PATH/download.sh wikitravel-urls.txt "WikiTravel Mobile.html" ./
crawler/wikitravel-process-pages.py (new executable file, 13 additions)
@@ -0,0 +1,13 @@
#!/usr/bin/python
import hashlib
import json
import re
import string
import sys

input = sys.stdin.read()
pages = re.findall('<a href="/en/(.+?)" title="(.+?)".+?bytes]</li>', input)
for page in pages:
    print json.dumps(("http://m.wikitravel.org/en/" + page[0],
                      page[1],
                      string.replace(page[0], '/', '_') + '_' + hashlib.md5(page[0]).hexdigest()[:8]))
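Each output line is a JSON array of (page URL, title, local file name), where the file name is the URL slug with '/' replaced by '_' plus the first 8 hex digits of the slug's MD5. A small Python 3 sketch of that naming rule, with a hypothetical slug for illustration:

import hashlib

def local_file_name(slug):
    # Mirror of the naming used above: '/' -> '_' plus an 8-character MD5 suffix.
    return slug.replace('/', '_') + '_' + hashlib.md5(slug.encode('utf-8')).hexdigest()[:8]

# Hypothetical example slug, not taken from the crawl output.
print(local_file_name('Paris/Arrondissements'))
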
crawler/wikitravel-process-redirects.py (new executable file, 9 additions)
@@ -0,0 +1,9 @@
#!/usr/bin/python
import json
import re
import sys

input = sys.stdin.read()
redirects = re.findall('<li><a .*? title="(.+?)">.*?</a>.*?<a .*? title="(.+?)">.*?</a></li>', input)
for redirect in redirects:
    print json.dumps(redirect)
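Each line of wikitravel-redirects.json is therefore a JSON pair of (source title, target title). A sketch of how a later step might load those pairs into a lookup dict; this consumer is an assumption, not part of the commit:

import json
import sys

# Build {source title: target title} from the redirect pairs on stdin.
redirects = {}
for line in sys.stdin:
    source, target = json.loads(line)
    redirects[source] = target

sys.stderr.write('{0} redirects loaded\n'.format(len(redirects)))
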