[crawler] Major update. Changed the Python interpreter location, added an article processor (wikitravel-process-articles.py) and an optimizer (htmlcompressor.jar and wikitravel-optimize-articles.py).

This commit is contained in:
Yury Melnichek 2011-04-04 20:57:38 +02:00 committed by Alex Zolotarev
parent ff1d7a5a82
commit 3ca8c59cd4
8 changed files with 220 additions and 14 deletions

BIN
crawler/htmlcompressor.jar Normal file

Binary file not shown.

View file

@ -2,23 +2,31 @@
set -e -u -x
MY_PATH=`dirname $0`
$MY_PATH/wikitravel-download-lists.sh
# $MY_PATH/wikitravel-download-lists.sh
cat wikitravel-redirects-*.html \
| $MY_PATH/wikitravel-process-redirects.py \
| grep -v Diving_the_Cape_Peninsula \
| grep -v '[^\s]*:' \
> wikitravel-redirects.json
# cat wikitravel-redirects-*.html \
# | $MY_PATH/wikitravel-process-redirects.py \
# | grep -v Diving_the_Cape_Peninsula \
# | grep -v '[^\s]*:' \
# > wikitravel-redirects.json
cat wikitravel-pages-*.html \
| $MY_PATH/wikitravel-process-pages.py \
| grep -v Diving_the_Cape_Peninsula \
> wikitravel-pages.json
# cat wikitravel-pages-*.html \
# | $MY_PATH/wikitravel-process-pages.py \
# | grep -v Diving_the_Cape_Peninsula \
# > wikitravel-pages.json
wc -l wikitravel-pages.json
# wc -l wikitravel-pages.json
cat wikitravel-pages.json | $MY_PATH/wikitravel-download-pages.py
# TODO: Strip articles
cat wikitravel-pages.json | $MY_PATH/wikitravel-process-articles.py
cat wikitravel-pages.json | $MY_PATH/wikitravel-optimize-articles.py
#for file in *.article
#do
#java -jar $MY_PATH/htmlcompressor.jar --remove-intertag-spaces --simple-bool-attr --remove-quotes \
# --remove-js-protocol --type html -o "${file}.opt" "${file}"
#done
# TODO: Run publisher.

View file

@ -1,4 +1,4 @@
#!/usr/bin/python
#!/opt/local/bin/python
import json
import os.path
import sys

View file

@ -0,0 +1,2 @@
</body>
</html>

View file

@ -0,0 +1,111 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0; user-scalable=0;"/>
<script type="text/javascript">
// Toggle one collapsible article section open/closed.
// `id` is the section suffix: "section-<id>" holds the content,
// "button-<id>" is the Show/Hide button whose label is kept in sync.
function tg(id) {
  var section = document.getElementById('section-' + id);
  var button = document.getElementById('button-' + id);
  var isVisible = (section.style.display == 'block');
  section.style.display = isVisible ? 'none' : 'block';
  button.innerHTML = isVisible ? 'Show' : 'Hide';
}
</script>
<style type="text/css">
body {
background:#ccc;
margin:0;
font-family:helvetica;
-webkit-text-size-adjust:none;
}
form {
margin:0;
}
div#content {
margin:6px;
padding:6px;
border:1px solid #777;
background-color:#fff;
-webkit-border-radius:6px;
-moz-border-radius:6px;
-webkit-box-shadow:rgba(0,0,0,.3) 1px 1px 3px;
font-size:0.9em;
line-height:1.4em;
}
div#content h1, div#content h2 {
margin:0;
border-bottom:solid 1px #aaa;
font-size:1.7em;
line-height:1.4em;
clear:both;
overflow:auto;
}
div#content h2 {
font-size:22px;
margin-top:12px;
}
div#content h2 button {
float:right;
}
div#bodyContent > div {
margin:6px 0;
}
div {
clear:both;
}
div#siteNotice, div.printfooter, div.magnify {
display:none;
}
div#p-toc {
display:none;
}
span.subpages {
display:block;
background-color:#e6e6e6;
padding:8px;
-webkit-border-radius:6px;
-moz-border-radius:6px;
}
ul.wt-toc {
list-style:none;
margin:10px 0;
padding:0;
}
ul.wt-toc ul {
margin:0 18px;
}
ul.wt-toc-compact {
display:none;
}
img, object {
border:none;
max-width:280px;
height:auto;
}
ul {
margin:10px 0px 10px -18px;
}
div.thumbinner {
padding:6px;
margin:6px 0 0 0;
border:1px solid #777;
background-color:#e6e6e6;
-webkit-border-radius:6px;
-moz-border-radius:6px;
-webkit-box-shadow:rgba(0,0,0,.3) 1px 1px 3px;
font-size:12px;
display:table;
}
div.loadHide {
display:none;
}
</style>
</head>
<body>

View file

@ -0,0 +1,22 @@
#!/opt/local/bin/python
"""Minify downloaded Wikitravel articles with htmlcompressor.

Reads one JSON triple [url, title, fileBase] per line from stdin and
compresses "<fileBase>.article" into "<fileBase>.article.opt",
skipping articles that already have an optimized copy.
"""
import json
import os
import re
import string
import subprocess
import sys

# htmlcompressor.jar ships next to this script.
myPath = os.path.dirname(os.path.realpath(__file__))

for i, line in enumerate(sys.stdin):
    (url, title, fileBase) = json.loads(line)
    fileName = fileBase + '.article'
    outFileName = fileName + '.opt'
    if os.path.exists(outFileName):
        sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName))
    else:
        sys.stderr.write('Optimizing {0} {1}\n'.format(i, fileName))
        # Pass an argument list (no shell) so file names containing spaces
        # or shell metacharacters cannot break or subvert the command.
        retCode = subprocess.call(
            ['java', '-jar', os.path.join(myPath, 'htmlcompressor.jar'),
             '--remove-intertag-spaces', '--simple-bool-attr', '--remove-quotes',
             '--remove-js-protocol', '--type', 'html',
             '-o', outFileName, fileName])
        if retCode != 0:
            # The old `assert 0 == os.system(...)` is stripped under
            # "python -O"; fail loudly on the first compressor error instead.
            sys.exit('htmlcompressor failed with code {0} on {1}'.format(
                retCode, fileName))

View file

@ -0,0 +1,63 @@
#!/opt/local/bin/python
import hashlib
import json
import os
import re
import string
import sys
from BeautifulSoup import BeautifulSoup
def RemoveEmptyTags(soup):
    """Strip <p>, <div> and <h2> tags that contain no child tags and no text.

    Removing one empty tag can leave its parent empty in turn, so the
    sweep is repeated a fixed number of times.
    """
    def _is_empty(tag):
        # "Empty" means: one of the target tag names, no child tag at all,
        # and either no string content or only whitespace.
        if tag.name not in ('p', 'div', 'h2'):
            return False
        if tag.find(True) is not None:
            return False
        return tag.string is None or tag.string.strip() == ''

    for _ in range(4):
        for emptyTag in soup.findAll(_is_empty):
            emptyTag.extract()
def ProcessArticle(article):
    """Strip Wikitravel mobile-page chrome from one article's raw HTML and
    return the cleaned contents of its <body> as a string."""
    soup = BeautifulSoup(article)
    # Drop navigation/branding elements that are useless in an offline copy.
    [x.extract() for x in soup.findAll(id = 'top1')]
    [x.extract() for x in soup.findAll(id = 'toolbar_top')]
    [x.extract() for x in soup.findAll(id = 'siteNotice')]
    [x.extract() for x in soup.findAll(id = 'p-toc')]
    [x.extract() for x in soup.findAll(id = 'catlinks')]
    [x.extract() for x in soup.findAll('div', 'search-container')]
    [x.extract() for x in soup.findAll('div', 'printfooter')]
    [x.extract() for x in soup.findAll('div', 'visualClear')]
    [x.extract() for x in soup.findAll('script')]
    [x.extract() for x in soup.findAll('ul', 'individual')]
    # Remove each "Plunge forward" edit notice together with its enclosing <div>.
    for notice in soup.findAll('a', href='http://m.wikitravel.org/en/Wikitravel:Plunge_forward'):
        noticeDiv = notice.findParent('div')
        if noticeDiv:
            noticeDiv.extract()
    # Remove empty tags. This is especially needed for the Get_out section,
    # since it contains the footer.
    RemoveEmptyTags(soup)
    # Ids look like 'section-<name>'; [8:] strips the 'section-' prefix.
    sections = [tag['id'][8:] for tag in soup.findAll(id = re.compile('section-.*'))]
    for section in sections:
        if soup.find(id = 'section-' + section) is None:
            # The section's content element is gone: empty out the header <h2>
            # that holds its 'button-<name>' toggle so the following
            # RemoveEmptyTags pass discards the header too.
            # NOTE(review): assumes a 'button-<name>' element always exists
            # inside an <h2> for every section id — confirm against the
            # header template's markup.
            [x.extract() for x in soup.find(id = 'button-' + section).findParent('h2')]
    RemoveEmptyTags(soup)
    s = str(soup)
    # The inline toggle JS was renamed toggleShowHide -> tg; patch callers.
    s = s.replace('toggleShowHide', 'tg')
    # Keep only what lies between <body> and </body>.
    s = re.search('<body>(.*)</body>', s, re.UNICODE | re.MULTILINE | re.DOTALL).group(1)
    return s
# Driver: each stdin line is a JSON triple produced by the page downloader.
for i, line in enumerate(sys.stdin):
    (url, title, fileName) = json.loads(line)
    outFileName = fileName + '.article'
    if os.path.exists(outFileName):
        sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName))
    else:
        sys.stderr.write('Processing {0} {1}\n'.format(i, fileName))
        # "with" closes the files even if ProcessArticle raises; the previous
        # manual open()/close() pairs leaked the handle on an exception.
        with open(fileName, 'r') as fin:
            article = ProcessArticle(fin.read())
        with open(outFileName, 'w') as fout:
            fout.write(article)

View file

@ -1,4 +1,4 @@
#!/usr/bin/python
#!/opt/local/bin/python
import json
import re
import sys