diff --git a/builder/htmlprocessor/processor.sh b/builder/htmlprocessor/processor.sh new file mode 100644 index 0000000..5fb0240 --- /dev/null +++ b/builder/htmlprocessor/processor.sh @@ -0,0 +1,5 @@ + +for interval in 0 1 2 3 4 5 6 7; + do python strip.py $1 $2 $3 $interval 8 + done + \ No newline at end of file diff --git a/builder/htmlprocessor/strip.py b/builder/htmlprocessor/strip.py index 73d6a53..6e1d224 100644 --- a/builder/htmlprocessor/strip.py +++ b/builder/htmlprocessor/strip.py @@ -1,46 +1,87 @@ import sys +import os +import urllib -if len(sys.argv) == 1: - print "Usage: " + sys.argv[0] + " [optional output file]" - exit(1) +reload(sys) +sys.setdefaultencoding('utf-8') from bs4 import BeautifulSoup -soup = BeautifulSoup(open(sys.argv[1])) -content = soup.find("div", {"id": "content"}) +def cleanUp(soup): + content = soup.find("div", {"id": "content"}) -# remove all specified tags -[s.decompose() for s in content(['noscript'])] + # remove all specified tags + [s.decompose() for s in content(['noscript'])] -[s.decompose() for s in content.findAll("a", {"id": "mw-mf-last-modified"})] -[s.decompose() for s in content.findAll("span", {"class": "mw-editsection"})] -[s.decompose() for s in content.findAll("table", {"class": "articleState"})] -[s.decompose() for s in content.findAll("button", {"class": "languageSelector"})] -[s.decompose() for s in content.findAll("a", {"class": "section_anchors"})] -[s.decompose() for s in content.findAll("div", {"id": "mw-mf-language-section"})] + [s.decompose() for s in content.findAll("a", {"id": "mw-mf-last-modified"})] + [s.decompose() for s in content.findAll("span", {"class": "mw-editsection"})] + [s.decompose() for s in content.findAll("table", {"class": "articleState"})] + [s.decompose() for s in content.findAll("button", {"class": "languageSelector"})] + [s.decompose() for s in content.findAll("a", {"class": "section_anchors"})] + [s.decompose() for s in content.findAll("div", {"id": "mw-mf-language-section"})] -# Wrap content with our own header and body -content = content.wrap(soup.new_tag("body")) -content = content.wrap(soup.new_tag("html")) -# Here we add our own js and css into the
-headTag = soup.new_tag("head") -cType = soup.new_tag("meta", content="text/html; charset=UTF-8") -# workaround as we can't use dashes in python names -cType["http-equiv"] = "Content-Type" -headTag.append(cType) -headTag.append(soup.new_tag("link", rel="stylesheet", type="text/css", href="article.css")) -headTag.append(soup.new_tag("script", type="text/javascript", href="article.js")) -meta1 = soup.new_tag("meta", content="yes") -# workaround as "name" is used in python -meta1["name"] = "apple-mobile-web-app-capable" -headTag.append(meta1) -meta2 = soup.new_tag("meta", content="initial-scale=1.0, user-scalable=yes, minimum-scale=0.25, maximum-scale=1.6") -meta2["name"] = "viewport" -headTag.append(meta2) -content.body.insert_before(headTag) + # Wrap content with our own header and body + content = content.wrap(soup.new_tag("body")) + content = content.wrap(soup.new_tag("html")) + # Here we add our own js and css into the + headTag = soup.new_tag("head") + cType = soup.new_tag("meta", content="text/html; charset=UTF-8") + # workaround as we can't use dashes in python names + cType["http-equiv"] = "Content-Type" + headTag.append(cType) + headTag.append(soup.new_tag("link", rel="stylesheet", type="text/css", href="article.css")) + headTag.append(soup.new_tag("script", type="text/javascript", href="article.js")) + meta1 = soup.new_tag("meta", content="yes") + # workaround as "name" is used in python + meta1["name"] = "apple-mobile-web-app-capable" + headTag.append(meta1) + meta2 = soup.new_tag("meta", content="initial-scale=1.0, user-scalable=yes, minimum-scale=0.25, maximum-scale=1.6") + meta2["name"] = "viewport" + headTag.append(meta2) + content.body.insert_before(headTag) + return content + +def imageExist(fileName): + global imageFiles + return urllib.unquote(fileName).lower() in imageFiles -if len(sys.argv) == 3: - open(sys.argv[2], "w").write(content.prettify().encode('utf-8')) -else: - print(content.prettify().encode('utf-8')) +def rewriteImages(soup): + imgTag = soup.findAll("img"); + + for imgElement in imgTag: + del imgElement['alt'] + #todo rewrite srcset attr if we can get callback on image loading in webview + del imgElement['srcset'] + + index = -1 + if "thumb" in imgElement['src'] and not '.pdf' in imgElement['src'].split("/")[-2]: + index = -2 + imageName = imgElement['src'].split("/")[index] + if imageExist(imageName): + imgElement['src'] = "images/" + imageName + else: + [s.decompose() for s in imgElement.fetchParents("div", {"class" : "thumb tright"})] + +def writeHtml(content, fileName): + global outDir + open(os.path.join(outDir, fileName), "w").write(content.prettify().encode('utf-8')) + +############################################################################## +if len(sys.argv) < 6: + print "Usage: " + sys.argv[0] + "