New Wikitravel crawler.

Yury Melnichek 2011-04-02 19:41:16 +02:00 committed by Alex Zolotarev
parent 4f1cc054ea
commit 571ca3397f
8 changed files with 69 additions and 95 deletions

@@ -1,53 +0,0 @@
on run argv
    -- Load the page and wait until it has finished loading
    tell application "Google Chrome"
        activate
        set myTab to make new tab at end of tabs of window 1
        tell myTab
            set URL to item 1 of argv -- e.g. "http://www.wikipedia.org"
            repeat -- wait for loading to complete
                set curStat to loading
                if curStat = false then exit repeat
                delay 0.25
            end repeat
        end tell
    end tell
    delay 1
    -- Click the save button
    repeat 10 times
        try
            tell application "System Events"
                tell process "Google Chrome"
                    set saveButton to button 5 of tool bar 1 of window 1
                    click saveButton
                    exit repeat
                end tell
            end tell
        on error
            delay 1
        end try
    end repeat
    -- Wait for the file to be created
    -- repeat while not (exists file (item 2 of argv) of application "Finder")
    --     delay 1
    -- end repeat
    -- Wait for the file to stop growing
    -- set resFile to (POSIX file (item 2 of argv))
    -- set size0 to 0
    -- set size1 to size of (info for resFile)
    -- repeat while size0 ≠ size1
    --     delay 0.25
    --     set size0 to size1
    --     set size1 to size of (info for resFile)
    -- end repeat
    delay 5
    tell myTab
        delete
    end tell
end run
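The two commented-out blocks sketch a more robust completion check: wait for the saved file to exist, then poll until its size stops changing; the script ships with a fixed `delay 5` instead. The same size-polling idea as a minimal Python 2 sketch (not part of the commit; the function name and poll intervals are illustrative):

import os.path
import time

def wait_until_stable(path, poll=0.25):
    # Block until `path` exists, then until its size is unchanged
    # between two consecutive polls -- the logic the commented-out
    # AppleScript above was reaching for.
    while not os.path.exists(path):
        time.sleep(1)
    size0, size1 = -1, os.path.getsize(path)
    while size0 != size1:
        time.sleep(poll)
        size0, size1 = size1, os.path.getsize(path)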

@@ -1,29 +0,0 @@
#!/bin/bash
set -e -u -x
MY_PATH=`dirname $(stat -f %N $PWD"/"$0)`
SAVED_PATH="${HOME}/Library/Application Support/Google/Chrome/Default/FileSystem/chrome-extension_jemlklgaibiijojffihnhieihhagocma_0/Persistent/chrome-oWHMA7fJwDx8JDjs"
SAVED_FILE="${SAVED_PATH}/${2}"
rm "$SAVED_FILE" || true

for i in $(cat "$1") ; do
    # Ask Chrome to load and save the page; retry once on failure.
    if ! osascript "$MY_PATH/download.applescript" "$i" "${SAVED_FILE}"
    then
        echo "applescript failed"
        sleep 10s
        osascript "$MY_PATH/download.applescript" "$i" "${SAVED_FILE}"
    fi
    if [ ! -f "${SAVED_FILE}" ]
    then
        sleep 5s
    fi
    if [ ! -f "${SAVED_FILE}" ]
    then
        echo "file not found"
        exit 1
    fi
    mv "${SAVED_FILE}" "$3/${i##*/}.html"
done
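The final `mv` renames Chrome's fixed-name download after each URL: `${i##*/}` is bash for "strip everything up to the last slash". The equivalent in Python 2, with a made-up URL for illustration:

url = 'http://m.wikitravel.org/en/Paris'  # hypothetical entry from the URL list
name = url.rsplit('/', 1)[1] + '.html'    # 'Paris.html', same as ${i##*/}".html"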

crawler/wikitravel-crawler.sh (new executable file)

@@ -0,0 +1,24 @@
#!/bin/bash
set -e -u -x
MY_PATH=`dirname $0`

$MY_PATH/wikitravel-download-lists.sh

cat wikitravel-redirects-*.html \
    | $MY_PATH/wikitravel-process-redirects.py \
    | grep -v Diving_the_Cape_Peninsula \
    | grep -v '[^\s]*:' \
    > wikitravel-redirects.json

cat wikitravel-pages-*.html \
    | $MY_PATH/wikitravel-process-pages.py \
    | grep -v Diving_the_Cape_Peninsula \
    > wikitravel-pages.json

wc -l wikitravel-pages.json

cat wikitravel-pages.json | $MY_PATH/wikitravel-download-pages.py

# TODO: Strip articles.
# TODO: Run publisher.
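The stages communicate through newline-delimited JSON: each line of wikitravel-pages.json is a (url, title, fileName) triple emitted by wikitravel-process-pages.py and consumed by wikitravel-download-pages.py. An illustrative line (all values invented):

["http://m.wikitravel.org/en/Paris", "Paris", "Paris_0a1b2c3d"]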

@@ -0,0 +1,23 @@
#!/usr/bin/python
import json
import os.path
import sys
import time
import urllib2

# stdin carries one JSON (url, title, fileName) triple per line,
# as produced by wikitravel-process-pages.py.
for i, line in enumerate(sys.stdin):
    (url, title, fileName) = json.loads(line)
    if os.path.exists(fileName):
        sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName))
    else:
        sys.stderr.write('Downloading {0} {1}\n'.format(i, fileName))
        remoteFile = urllib2.urlopen(url)
        data = remoteFile.read()
        remoteFile.close()
        localFile = open(fileName, 'w')
        localFile.write(data)
        localFile.close()
        time.sleep(1)  # throttle requests to one per second
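`urllib2.urlopen` raises `URLError` on network failures and `HTTPError` on error statuses, so one bad page aborts the whole run. A hedged sketch of a retrying wrapper that could stand in for the bare call above (the attempt count and backoff are assumptions, not in the commit):

import time
import urllib2

def fetch(url, attempts=3):
    # Retry transient failures with exponential backoff; HTTPError
    # is a subclass of URLError, so one handler covers both.
    for attempt in range(attempts):
        try:
            remote = urllib2.urlopen(url)
            try:
                return remote.read()
            finally:
                remote.close()
        except urllib2.URLError as error:
            lastError = error
            time.sleep(2 ** attempt)
    raise lastError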

@@ -1,13 +0,0 @@
#!/bin/bash
set -e -u -x
MY_PATH=`dirname $(stat -f %N $PWD"/"$0)`

cat wikitravel-pages-*.html \
    | egrep '<a href=\"/en/.+?bytes]</li>' -o \
    | sed "s@<a href=\"@http://m.wikitravel.org@" \
    | sed "s@\" title=.*</a>.*bytes]</li>@@" \
    | grep -v phrasebook \
    | grep -v "Diving_the_Cape_Peninsula" \
    > wikitravel-urls.txt

# $MY_PATH/download.sh wikitravel-urls.txt "WikiTravel Mobile.html" ./

@@ -0,0 +1,13 @@
#!/usr/bin/python
import hashlib
import json
import re
import string
import sys

input = sys.stdin.read()
pages = re.findall('<a href="/en/(.+?)" title="(.+?)".+?bytes]</li>', input)
for page in pages:
    print json.dumps(("http://m.wikitravel.org/en/" + page[0],
                      page[1],
                      string.replace(page[0], '/', '_') + '_' + hashlib.md5(page[0]).hexdigest()[:8]))
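The 8-character md5 suffix keeps generated file names unique once slashes are flattened to underscores: 'Foo/Bar' and 'Foo_Bar' would otherwise collide. A worked illustration ('Foo/Bar' is an invented page path):

import hashlib
page = 'Foo/Bar'
fileName = page.replace('/', '_') + '_' + hashlib.md5(page).hexdigest()[:8]
# 'Foo_Bar_' plus 8 hex digits of md5('Foo/Bar'), which differs from
# the suffix an actual 'Foo_Bar' article would get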

@@ -0,0 +1,9 @@
#!/usr/bin/python
import json
import re
import sys

input = sys.stdin.read()
redirects = re.findall('<li><a .*? title="(.+?)">.*?</a>.*?<a .*? title="(.+?)">.*?</a></li>', input)
for redirect in redirects:
    print json.dumps(redirect)
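Each match is what appears to be a (source title, target title) pair, emitted as a one-line JSON array, e.g. (titles invented for illustration):

["NYC", "New York City"]

wikitravel-crawler.sh then greps out entries containing ':', apparently to drop namespaced titles.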