image links rewrite
This commit is contained in:
parent
9c7f09ce6b
commit
5ed265561e
2 changed files with 82 additions and 36 deletions
5
builder/htmlprocessor/processor.sh
Normal file
5
builder/htmlprocessor/processor.sh
Normal file
|
@ -0,0 +1,5 @@
|
|||
|
||||
# Run strip.py once per slice index 0..7, passing 8 as the slice count.
# $1, $2 and $3 are forwarded unchanged; they are quoted so that paths
# containing spaces or glob characters reach strip.py as single arguments.
# NOTE(review): the slices run one after another. If they are meant to run
# in parallel, append '&' to the python line and add 'wait' after the loop.
for interval in 0 1 2 3 4 5 6 7;
do python strip.py "$1" "$2" "$3" "$interval" 8
done
|
||||
|
|
@ -1,46 +1,87 @@
|
|||
import sys
|
||||
import os
|
||||
import urllib
|
||||
|
||||
if len(sys.argv) == 1:
|
||||
print "Usage: " + sys.argv[0] + " <html article file> [optional output file]"
|
||||
exit(1)
|
||||
reload(sys)
|
||||
sys.setdefaultencoding('utf-8')
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
soup = BeautifulSoup(open(sys.argv[1]))
|
||||
content = soup.find("div", {"id": "content"})
|
||||
def cleanUp(soup):
    """Reduce a parsed article page to its content and wrap it in a new document.

    Looks up ``<div id="content">`` in *soup*, deletes the elements listed
    below from it, wraps the div in freshly created ``<body>`` and ``<html>``
    tags and inserts a ``<head>`` that references ``article.css`` and
    ``article.js`` in front of the body.

    Returns the newly created ``<html>`` tag.
    """
    content = soup.find("div", {"id": "content"})

    # remove all specified tags
    for node in content(['noscript']):
        node.decompose()

    # (tag name, attribute filter) pairs of further elements to delete
    unwanted = [
        ("a", {"id": "mw-mf-last-modified"}),
        ("span", {"class": "mw-editsection"}),
        ("table", {"class": "articleState"}),
        ("button", {"class": "languageSelector"}),
        ("a", {"class": "section_anchors"}),
        ("div", {"id": "mw-mf-language-section"}),
    ]
    for tagName, attrs in unwanted:
        for node in content.findAll(tagName, attrs):
            node.decompose()

    # Wrap content with our own header and body
    content = content.wrap(soup.new_tag("body"))
    content = content.wrap(soup.new_tag("html"))

    # Here we add our own js and css into the <head>
    headTag = soup.new_tag("head")

    cType = soup.new_tag("meta", content="text/html; charset=UTF-8")
    # workaround as we can't use dashes in python names
    cType["http-equiv"] = "Content-Type"
    headTag.append(cType)

    headTag.append(soup.new_tag("link", rel="stylesheet", type="text/css", href="article.css"))
    # An HTML <script> element loads external code through "src"; an "href"
    # attribute on it is ignored, so the script would never be fetched.
    headTag.append(soup.new_tag("script", type="text/javascript", src="article.js"))

    meta1 = soup.new_tag("meta", content="yes")
    # workaround as "name" is used in python
    meta1["name"] = "apple-mobile-web-app-capable"
    headTag.append(meta1)

    meta2 = soup.new_tag("meta", content="initial-scale=1.0, user-scalable=yes, minimum-scale=0.25, maximum-scale=1.6")
    meta2["name"] = "viewport"
    headTag.append(meta2)

    content.body.insert_before(headTag)

    return content
|
||||
|
||||
def imageExist(fileName):
    """Return True if *fileName* is listed in the module-level ``imageFiles``.

    The name is URL-unquoted and lower-cased before the membership test.
    """
    normalized = urllib.unquote(fileName).lower()
    return normalized in imageFiles
|
||||
|
||||
|
||||
if len(sys.argv) == 3:
|
||||
open(sys.argv[2], "w").write(content.prettify().encode('utf-8'))
|
||||
else:
|
||||
print(content.prettify().encode('utf-8'))
|
||||
def rewriteImages(soup):
    """Point the ``<img>`` tags in *soup* at the local ``images/`` directory.

    For every image the ``alt`` and ``srcset`` attributes are removed. The
    image name is taken from the ``src`` path: the second-to-last component
    when the path contains "thumb" and that component does not contain
    ".pdf", otherwise the last component. If ``imageExist`` accepts the name,
    ``src`` becomes ``"images/" + name``; otherwise every enclosing
    ``<div class="thumb tright">`` is removed from the tree.

    Images without a ``src`` attribute only have ``alt``/``srcset`` removed.
    """
    # Divs to delete are collected first and decomposed after the loop:
    # the image list was built up front and may contain tags that live
    # inside such a div, and a decomposed tag must not be used any more.
    doomed = []
    seen = set()

    for imgElement in soup.findAll("img"):
        del imgElement['alt']
        #todo rewrite srcset attr if we can get callback on image loading in webview
        del imgElement['srcset']

        src = imgElement.get('src')
        if not src:
            continue

        parts = src.split("/")
        # len() guard: a "thumb" src without any "/" has no parts[-2].
        isThumb = "thumb" in src and len(parts) >= 2 and '.pdf' not in parts[-2]
        imageName = parts[-2] if isThumb else parts[-1]

        if imageExist(imageName):
            imgElement['src'] = "images/" + imageName
        else:
            for div in imgElement.fetchParents("div", {"class" : "thumb tright"}):
                # Deduplicate by identity; equal-looking tags compare equal.
                if id(div) not in seen:
                    seen.add(id(div))
                    doomed.append(div)

    for div in doomed:
        div.decompose()
|
||||
|
||||
def writeHtml(content, fileName):
    """Write *content* to ``fileName`` inside the module-level ``outDir``.

    The text written is ``content.prettify()`` encoded as UTF-8. The file is
    opened in a ``with`` block so the handle is closed (and the data flushed)
    as soon as the write finishes.
    """
    with open(os.path.join(outDir, fileName), "w") as out:
        out.write(content.prettify().encode('utf-8'))
|
||||
|
||||
##############################################################################
|
||||
# Script entry: process this worker's slice of the article directory.
if len(sys.argv) < 6:
    print("Usage: " + sys.argv[0] + " <directory with html articles> <images directory> <output directory> <threadIndex> <cpu core count>")
    sys.exit(1)

inDir = sys.argv[1]
# A set gives constant-time lookups for the per-image membership test.
imageFiles = set(unicode(urllib.unquote(name).lower()) for name in os.listdir(sys.argv[2]))
outDir = sys.argv[3]
threadIndex = int(sys.argv[4])
coreCount = int(sys.argv[5])

# os.listdir returns names in arbitrary order; sorting makes the slice for a
# given threadIndex depend only on the directory contents.
# NOTE(review): names are unquoted before being used to open the input file;
# confirm that on-disk names never contain percent-escapes.
files = sorted(urllib.unquote(name) for name in os.listdir(inDir))
# "//" keeps the slice bounds integral regardless of the division mode.
thisFiles = files[threadIndex * len(files) // coreCount : (threadIndex + 1) * len(files) // coreCount]

for name in thisFiles:
    # Close the input file as soon as it has been parsed.
    with open(os.path.join(inDir, name)) as page:
        soup = BeautifulSoup(page)
    soup = cleanUp(soup)
    rewriteImages(soup)
    writeHtml(soup, name)
|
Reference in a new issue