forked from organicmaps/organicmaps
[crawler] Major update. Changed python location, added article processor (wikitravel-process-articles.py) and optimizer (htmlcompressor.jar and wikitravel-optimize-articles.py).
parent ff1d7a5a82
commit 3ca8c59cd4
8 changed files with 220 additions and 14 deletions
BIN  crawler/htmlcompressor.jar  Normal file
Binary file not shown.
@@ -2,23 +2,31 @@
set -e -u -x
MY_PATH=`dirname $0`

$MY_PATH/wikitravel-download-lists.sh
# $MY_PATH/wikitravel-download-lists.sh

cat wikitravel-redirects-*.html \
| $MY_PATH/wikitravel-process-redirects.py \
| grep -v Diving_the_Cape_Peninsula \
| grep -v '[^\s]*:' \
> wikitravel-redirects.json
# cat wikitravel-redirects-*.html \
# | $MY_PATH/wikitravel-process-redirects.py \
# | grep -v Diving_the_Cape_Peninsula \
# | grep -v '[^\s]*:' \
# > wikitravel-redirects.json

cat wikitravel-pages-*.html \
| $MY_PATH/wikitravel-process-pages.py \
| grep -v Diving_the_Cape_Peninsula \
> wikitravel-pages.json
# cat wikitravel-pages-*.html \
# | $MY_PATH/wikitravel-process-pages.py \
# | grep -v Diving_the_Cape_Peninsula \
# > wikitravel-pages.json

wc -l wikitravel-pages.json
# wc -l wikitravel-pages.json

cat wikitravel-pages.json | $MY_PATH/wikitravel-download-pages.py

# TODO: Strip articles
cat wikitravel-pages.json | $MY_PATH/wikitravel-process-articles.py

cat wikitravel-pages.json | $MY_PATH/wikitravel-optimize-articles.py

#for file in *.article
#do
#java -jar $MY_PATH/htmlcompressor.jar --remove-intertag-spaces --simple-bool-attr --remove-quotes \
# --remove-js-protocol --type html -o "${file}.opt" "${file}"
#done

# TODO: Run publisher.
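Every stage after the download step reads wikitravel-pages.json from stdin as one JSON array per line, and the two new scripts unpack it the same way. A minimal sketch of such a consumer, assuming that format; the script name and the sample record in the comments are hypothetical:

#!/opt/local/bin/python
# Minimal sketch of a pipeline stage reading wikitravel-pages.json from stdin.
# A line is expected to decode to [url, title, fileName]; the usage line and
# sample record below are made up for illustration.
# usage (hypothetical): cat wikitravel-pages.json | ./some-stage.py
# sample line (hypothetical): ["http://m.wikitravel.org/en/Example", "Example", "articles/Example"]
import json
import sys

for i, line in enumerate(sys.stdin):
    (url, title, fileName) = json.loads(line)
    sys.stderr.write('{0}: {1} -> {2}\n'.format(i, title, fileName))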
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/opt/local/bin/python
 import json
 import os.path
 import sys
2  crawler/wikitravel-footer.html  Normal file
@@ -0,0 +1,2 @@
</body>
</html>
111  crawler/wikitravel-header.html  Normal file
@@ -0,0 +1,111 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0; user-scalable=0;"/>
<script type="text/javascript">
function tg(id) {
  if (document.getElementById('section-'+id).style.display == 'block') {
    document.getElementById('section-'+id).style.display = 'none';
    document.getElementById('button-'+id).innerHTML = 'Show';
  } else {
    document.getElementById('section-'+id).style.display = 'block';
    document.getElementById('button-'+id).innerHTML = 'Hide';
    /*
    if (document.getElementById('section-'+id).innerHTML.replace(/^\s+|\s+$/g,'') == '') {
      document.getElementById('section-'+id).innerHTML = 'No content yet';
    }
    */
  }
}
</script>
<style type="text/css">
body {
  background:#ccc;
  margin:0;
  font-family:helvetica;
  -webkit-text-size-adjust:none;
}
form {
  margin:0;
}
div#content {
  margin:6px;
  padding:6px;
  border:1px solid #777;
  background-color:#fff;
  -webkit-border-radius:6px;
  -moz-border-radius:6px;
  -webkit-box-shadow:rgba(0,0,0,.3) 1px 1px 3px;
  font-size:0.9em;
  line-height:1.4em;
}
div#content h1, div#content h2 {
  margin:0;
  border-bottom:solid 1px #aaa;
  font-size:1.7em;
  line-height:1.4em;
  clear:both;
  overflow:auto;
}
div#content h2 {
  font-size:22px;
  margin-top:12px;
}
div#content h2 button {
  float:right;
}
div#bodyContent > div {
  margin:6px 0;
}
div {
  clear:both;
}
div#siteNotice, div.printfooter, div.magnify {
  display:none;
}
div#p-toc {
  display:none;
}
span.subpages {
  display:block;
  background-color:#e6e6e6;
  padding:8px;
  -webkit-border-radius:6px;
  -moz-border-radius:6px;
}
ul.wt-toc {
  list-style:none;
  margin:10px 0;
  padding:0;
}
ul.wt-toc ul {
  margin:0 18px;
}
ul.wt-toc-compact {
  display:none;
}
img, object {
  border:none;
  max-width:280px;
  height:auto;
}
ul {
  margin:10px 0px 10px -18px;
}
div.thumbinner {
  padding:6px;
  margin:6px 0 0 0;
  border:1px solid #777;
  background-color:#e6e6e6;
  -webkit-border-radius:6px;
  -moz-border-radius:6px;
  -webkit-box-shadow:rgba(0,0,0,.3) 1px 1px 3px;
  font-size:12px;
  display:table;
}
div.loadHide {
  display:none;
}
</style>
</head>
<body>
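wikitravel-header.html and wikitravel-footer.html look like the shared wrapper for the stripped article bodies: wikitravel-process-articles.py keeps only what sits between <body> and </body>, and the shell script still ends with "# TODO: Run publisher.". The following assembly step is only a guess at how the pieces are meant to fit together; WrapArticle and the output naming are made up and not part of this commit:

#!/opt/local/bin/python
# Hypothetical publisher step: wrap one processed .article body with the
# header and footer added in this commit. An assumption, not part of the change.
import os.path
import sys

myPath = os.path.dirname(os.path.realpath(__file__))

def WrapArticle(articleFileName, outFileName):
    parts = []
    for name in [os.path.join(myPath, 'wikitravel-header.html'),
                 articleFileName,
                 os.path.join(myPath, 'wikitravel-footer.html')]:
        f = open(name, 'r')
        parts.append(f.read())
        f.close()
    fout = open(outFileName, 'w')
    fout.write(''.join(parts))
    fout.close()

if __name__ == '__main__':
    WrapArticle(sys.argv[1], sys.argv[1] + '.html')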
22  crawler/wikitravel-optimize-articles.py  Executable file
@@ -0,0 +1,22 @@
#!/opt/local/bin/python
import json
import os
import re
import string
import sys

myPath = os.path.dirname(os.path.realpath(__file__))

for i, line in enumerate(sys.stdin):
    (url, title, fileBase) = json.loads(line)
    fileName = fileBase + '.article'
    outFileName = fileName + '.opt'
    if os.path.exists(outFileName):
        sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName))
    else:
        sys.stderr.write('Optimizing {0} {1}\n'.format(i, fileName))
        assert 0 == os.system('java -jar {myPath}/htmlcompressor.jar '
                              '--remove-intertag-spaces --simple-bool-attr --remove-quotes '
                              '--remove-js-protocol --type html '
                              '-o {outFileName} {fileName}'
                              .format(myPath = myPath, fileName = fileName, outFileName = outFileName))
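wikitravel-optimize-articles.py shells out through os.system, so {fileName} and {outFileName} are substituted into the command unquoted; that is fine as long as article file names never contain spaces or shell metacharacters. An alternative sketch (not what the commit does) passing the same htmlcompressor flags through subprocess with an argument list, which avoids shell quoting entirely; the helper name Optimize is made up:

#!/opt/local/bin/python
# Alternative sketch: invoke htmlcompressor.jar with an argument list via
# subprocess instead of a shell string. Same flags as the commit.
import os
import subprocess

myPath = os.path.dirname(os.path.realpath(__file__))

def Optimize(fileName, outFileName):
    subprocess.check_call([
        'java', '-jar', os.path.join(myPath, 'htmlcompressor.jar'),
        '--remove-intertag-spaces', '--simple-bool-attr', '--remove-quotes',
        '--remove-js-protocol', '--type', 'html',
        '-o', outFileName, fileName])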
63  crawler/wikitravel-process-articles.py  Executable file
@@ -0,0 +1,63 @@
#!/opt/local/bin/python
import hashlib
import json
import os
import re
import string
import sys
from BeautifulSoup import BeautifulSoup

def RemoveEmptyTags(soup):
    # Removing free tags can make other tags free, so we do it several times in a loop.
    for i in range(1, 5):
        [x.extract() for x in soup.findAll(lambda tag: tag.name in ['p', 'div', 'h2']
                                           and tag.find(True) is None
                                           and (tag.string is None or tag.string.strip() == ''))]

def ProcessArticle(article):
    soup = BeautifulSoup(article)
    [x.extract() for x in soup.findAll(id = 'top1')]
    [x.extract() for x in soup.findAll(id = 'toolbar_top')]
    [x.extract() for x in soup.findAll(id = 'siteNotice')]
    [x.extract() for x in soup.findAll(id = 'p-toc')]
    [x.extract() for x in soup.findAll(id = 'catlinks')]
    [x.extract() for x in soup.findAll('div', 'search-container')]
    [x.extract() for x in soup.findAll('div', 'printfooter')]
    [x.extract() for x in soup.findAll('div', 'visualClear')]
    [x.extract() for x in soup.findAll('script')]
    [x.extract() for x in soup.findAll('ul', 'individual')]

    for notice in soup.findAll('a', href='http://m.wikitravel.org/en/Wikitravel:Plunge_forward'):
        noticeDiv = notice.findParent('div')
        if noticeDiv:
            noticeDiv.extract()

    # Remove empty tags. This is especially needed for the Get_out section, since it contains the footer.
    RemoveEmptyTags(soup)
    sections = [tag['id'][8:] for tag in soup.findAll(id = re.compile('section-.*'))]
    for section in sections:
        if soup.find(id = 'section-' + section) is None:
            [x.extract() for x in soup.find(id = 'button-' + section).findParent('h2')]
    RemoveEmptyTags(soup)

    s = str(soup)
    s = s.replace('toggleShowHide', 'tg')
    s = re.search('<body>(.*)</body>', s, re.UNICODE | re.MULTILINE | re.DOTALL).group(1)
    return s

for i, line in enumerate(sys.stdin):
    (url, title, fileName) = json.loads(line)
    outFileName = fileName + '.article'
    if os.path.exists(outFileName):
        sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName))
    else:
        sys.stderr.write('Processing {0} {1}\n'.format(i, fileName))
        fin = open(fileName, 'r')
        article = ProcessArticle(fin.read())
        fin.close()

        fout = open(outFileName, 'w')
        fout.write(article)
        fout.close()
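For a quick check of the cleanup logic without a full crawl, the same BeautifulSoup 3 calls can be exercised on a tiny hand-written snippet. Everything about the sample below is made up; only the extraction pattern mirrors the script above:

#!/opt/local/bin/python
# Smoke test for the tag-stripping pattern used by ProcessArticle.
# The HTML snippet is hypothetical; the findAll/extract calls match the script.
from BeautifulSoup import BeautifulSoup

sample = ('<html><body>'
          '<div id="siteNotice">navigation chrome to strip</div>'
          '<h2>Sleep</h2>'
          '<div id="section-Sleep"></div>'
          '<p>Kept text.</p>'
          '</body></html>')

soup = BeautifulSoup(sample)
[x.extract() for x in soup.findAll(id='siteNotice')]
# Same predicate as RemoveEmptyTags: drop p/div/h2 tags with no children and no text.
for i in range(1, 5):
    [x.extract() for x in soup.findAll(lambda tag: tag.name in ['p', 'div', 'h2']
                                       and tag.find(True) is None
                                       and (tag.string is None or tag.string.strip() == ''))]
print str(soup)  # siteNotice and the empty section-Sleep div are gone; the rest stays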
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/opt/local/bin/python
 import json
 import re
 import sys