Merge pull request #4 from Komzpa/generator

Generator
deathbaba 2013-08-13 08:44:34 -07:00
commit 0d4b1ba779
5 changed files with 254 additions and 176 deletions

View file

@@ -0,0 +1,25 @@
+#!/bin/bash
+# Generate thumbnails and downscaled images with ImageMagick.
+# Note: outDir should be an absolute path, since the script cd's into the input dirs.
+if test "$3" == "" ; then
+  echo "USAGE: $0 [thumbsInDir] [imagesInDir] [outDir]"
+  exit 1
+fi
+outdir=$3
+mkdir -p "$outdir"
+mkdir -p "$outdir/thumb"
+pushd "$1"
+for i in *.png; do convert "$i" -auto-orient -quality 53 -thumbnail '256x256>' "$outdir/thumb/$(basename -s .png "$i").jpg"; echo "$i"; done
+for i in *.svg; do convert "$i" -auto-orient -quality 53 -thumbnail '256x256>' "$outdir/thumb/$(basename -s .svg "$i").jpg"; echo "$i"; done
+for i in *.jpg *.JPG *.jpeg; do convert -define jpeg:size=400x280 "$i" -auto-orient -quality 53 -thumbnail '500x280>' -strip -liquid-rescale '256x256!>' "$outdir/thumb/$i"; echo "$i"; done
+popd
+mkdir -p "$outdir/images"
+pushd "$2"
+for i in *.jpg; do convert "$i" -auto-orient -quality 53 -strip -thumbnail '1536x1536>' "$outdir/images/$i"; echo "$i"; done
+for i in *.png; do convert "$i" -auto-orient -quality 99 -strip -thumbnail '4000x3000>' "PNG8:$outdir/images/$i"; echo "$i"; done
+cp *.svg "$outdir/images/"
+popd
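The '256x256>' geometry shrinks only images larger than 256px on a side (the '>' suffix never enlarges), and -liquid-rescale '256x256!>' seam-carves photos to an exact 256x256 square instead of letterboxing them. A rough Python equivalent of the *.png pass, assuming ImageMagick's convert is on the PATH (function and directory names here are illustrative):

import glob
import os
import subprocess

def makePngThumbs(thumbsInDir, outDir):
    # mirror the *.png loop above: shrink-only thumbnail, JPEG quality 53
    for path in glob.glob(os.path.join(thumbsInDir, "*.png")):
        name = os.path.splitext(os.path.basename(path))[0]
        subprocess.call(["convert", path, "-auto-orient", "-quality", "53",
                         "-thumbnail", "256x256>",
                         os.path.join(outDir, "thumb", name + ".jpg")])
        print path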

View file

@@ -4,7 +4,10 @@ import os
 import urllib
 import shutil
 import unicodedata
-from bs4 import BeautifulSoup
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    from BeautifulSoup import BeautifulSoup
 reload(sys)
 sys.setdefaultencoding('utf-8')
@@ -13,166 +16,184 @@ from strip_function import cleanUp
 def insertMapLink(soup, lat, lon, title, pageId):
-    hrefLink = "mapswithme://map?v=1&ll=%s,%s&n=%s&id=%s&backurl=guideswithme&appname=Guides%%20With%%20Me"%(lat, lon, urllib.quote(title), pageId)
+    hrefLink = "mapswithme://map?v=1&ll=%s,%s&n=%s&id=%s&backurl=guideswithme&appname=Guides%%20With%%20Me" % (lat, lon, urllib.quote(title), pageId)
     mapTag = BeautifulSoup().new_tag("a", href=hrefLink)
-    mapTag["class"] = "geolink";
+    mapTag["class"] = "geolink"
     soup.body.insert(0, mapTag)
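For illustrative values lat=59.93, lon=30.31, title "Saint Petersburg" and pageId 42, the format string above expands to (urllib.quote turns the space into %20, and the doubled %% survives as a literal %20 in the app name):

mapswithme://map?v=1&ll=59.93,30.31&n=Saint%20Petersburg&id=42&backurl=guideswithme&appname=Guides%20With%20Me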
 def insertArticleTitle(soup, articleTitle):
     titleTag = BeautifulSoup().new_tag("div")
     titleTag["class"] = "articleHeader"
     titleTag.append(articleTitle)
     soup.body.insert(0, titleTag)
 def insertArticleImage(soup, imagePath):
-    imgTag = BeautifulSoup().new_tag("img", src=imagePath)
+    imgTag = BeautifulSoup().new_tag("img", style="background-image: url('%s')" % imagePath, id="articleImage")
     imgTag["class"] = "articleImage"
     soup.body.insert(0, imgTag)
 def insertBreadcrumb(soup, articleTitle, parentTitle, parentLink, grandParentTitle, grandParentLink):
     tagFactory = BeautifulSoup()
     bcWrapper = tagFactory.new_tag("div")
     bcWrapper["class"] = "breadcrumbs_wrapper"
     if (grandParentTitle):
         grandParentTag = tagFactory.new_tag("a", href=grandParentLink)
         grandParentTag["class"] = "breadcrumb bc1"
         grandParentTag.append(grandParentTitle)
         bcWrapper.append(grandParentTag)
     if (parentTitle):
         parentTag = tagFactory.new_tag("a", href=parentLink)
         parentTag["class"] = "breadcrumb bc2"
         parentTag.append(parentTitle)
         bcWrapper.append(parentTag)
     currTag = tagFactory.new_tag("span")
     currTag["class"] = "breadcrumb bc3"
     currTag.append(articleTitle)
     bcWrapper.append(currTag)
     soup.body.insert(0, bcWrapper)
-def transformStringWithEncoding(str):
-    return urllib.unquote(str.decode("latin-1").encode("utf-8"))
+def transformStringWithEncoding(string):
+    return urllib.unquote(string.decode("latin-1").encode("utf-8"))
-def transformString(s):
-    unquoted = urllib.unquote(str(s));
-    for i in u"\"',/\\@#$%^&*()!~`«»":
-        unquoted = unquoted.replace(i, "_")
-    return unicode(unquoted.strip("_"))
-def formatToNFKD(s):
-    return unicodedata.normalize("NFKD", transformString(s))
-def unicodeNormalize(s):
-    return (u"".join( x for x in formatToNFKD(s) if not unicodedata.category(x).startswith("M"))).lower()
-def imageExist(fileName):
-    global imageFiles
-    global imageSet
-    unquotedName = unicodeNormalize(fileName)
-    if unquotedName in imageFiles:
-        imageSet.add(unquotedName)
-        return True
-    return False
+def sanitizeFileName(s):
+    # unquote %20 and the like
+    s = urllib.unquote(str(s))
+    # remove punctuation
+    for i in u"\"',/\\@#$%^&*()!~`«»":
+        s = s.replace(i, "_")
+    s = s.strip("_")
+    # normalize unicode to NFKD (form with modifiers separated)
+    s = unicodedata.normalize("NFKD", s)
+    # drop modifiers from the string (no diacritics)
+    s = u"".join(x for x in s if not unicodedata.category(x).startswith("M"))
+    # lowercase
+    s = s.lower()
+    return s
+def imageSanitizedPath(fileName):
+    """
+    Return the path to an image file if it's in the dataset, None otherwise.
+    """
+    global imageFiles
+    global imageSet
+    unquotedName = sanitizeFileName(fileName)
+    if unquotedName in imageFiles:
+        imageSet.add(unquotedName)
+        return 'images/' + unquotedName
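An illustrative walk through the sanitizer with a typical wiki image name (each comment shows the state after that step):

sanitizeFileName("Caf%C3%A9_de_Flore.JPG")
# urllib.unquote   -> "Café_de_Flore.JPG"
# punctuation pass -> unchanged ("_" and "." are not in the replaced set)
# NFKD + drop marks -> "Cafe_de_Flore.JPG"
# lower()          -> "cafe_de_flore.jpg"

Since the keys of imageFiles are built with the same function, any two spellings that normalize alike will match.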
 def rewriteImages(soup):
-    imgTag = soup.findAll("img");
+    imgTag = soup.findAll("img")
     for imgElement in imgTag:
         del imgElement["alt"]
-        #todo rewrite srcset attr if we can get callback on image loading in webview
+        # todo: rewrite the srcset attr if we can get a callback on image loading in the webview
         del imgElement["srcset"]
         index = -1
-        srcPath = imgElement["src"]
-        splitedSrc = srcPath.split("/")
-        if imageExist(splitedSrc[-1]):
-            imgElement['src'] = "images/" + unicodeNormalize(splitedSrc[-1])
-        elif imageExist(splitedSrc[-2]):
-            imgElement['src'] = "images/" + unicodeNormalize(splitedSrc[-2])
-        else:
-            print "Image strip = " + unicodeNormalize(splitedSrc[-2])
-            [s.decompose() for s in imgElement.fetchParents("div", {"class" : ["thumb tright", "thumbinner", "image"]})]
+        splitSrc = imgElement["src"].split("/")
+        splitSrc.reverse()
+        # check just the last two path elements (preview name, real name)
+        for fileName in splitSrc[:2]:
+            fileName = imageSanitizedPath(fileName)
+            if fileName:
+                imgElement["src"] = fileName
+                break
+        else:
+            print "Stripping image", imgElement["src"]
+            [s.decompose() for s in imgElement.fetchParents("div", {"class": ["thumb tright", "thumbinner", "image"]})]
 def rewriteCrossLinks(soup):
     global idMapping
     global redirectMapping
     links = soup.findAll("a")
     for link in links:
-        destTitle = link["href"].split("/",2)[-1]
+        destTitle = link["href"].split("/", 2)[-1]
         destTitle = transformStringWithEncoding(destTitle)
-        destTitle = redirectMapping.get(destTitle, destTitle);
+        destTitle = redirectMapping.get(destTitle, destTitle)
         if destTitle in idMapping:
             link["href"] = idMapping.get(destTitle, link["href"]) + ".html"
             continue
         if "/wiki/File:" in link["href"] and "http" not in link["href"] and "www" not in link["href"]:
             imgElement = link.find("img")
             if imgElement:
                 link["href"] = imgElement["src"]
             continue
         if "/wiki/" in link["href"]:
             if link.string:
                 link.replace_with(link.string)
             else:
                 link.replace_with("")
 def writeHtml(content, fileName):
     global outDir
     open(os.path.join(outDir, fileName + ".html"), "w").write(content.encode('utf-8'))
 def fixTitle(title):
     return title.split('/')[-1].replace('_', ' ')
 ##############################################################################
 if len(sys.argv) < 9:
     print "Usage: " + sys.argv[0] + " <directory with html articles> <images directory> <article set info file> <redirect info file> <geocoords file> <output directory> <threadIndex> <cpu core count>"
     exit(1)
 inDir = sys.argv[1]
 imagesSrcDir = sys.argv[2]
-imageFiles = dict([(unicodeNormalize(file), file) for file in os.listdir(imagesSrcDir)])
-idMapping = dict([(unicode(i.split("\t")[1]), unicode(i.split("\t")[0])) for i in open(sys.argv[3])])
+imageFiles = dict([(sanitizeFileName(file), file) for file in os.listdir(imagesSrcDir)])
+idMappingFile = sys.argv[3]
+idMapping = dict([(unicode(i.split("\t")[1]), unicode(i.split("\t")[0])) for i in open(idMappingFile)])
+articleImages = dict([(i.split("\t")[0], i.strip().split("\t")[3]) for i in open(sys.argv[3])])
 # pageId => [parentId, parentTitle, grandParentId, grandParentTitle], ids and titles can be "NULL"
 ancestors = dict([(i.split("\t")[0], i.strip().split("\t")[4:8]) for i in open(sys.argv[3])])
 redirectMapping = dict([(unicode(line.split("\t")[1]), unicode(line.split("\t")[3].strip())) for line in open(sys.argv[4])])
 coords = dict([(line.split("\t")[0], (line.split("\t")[1], line.split("\t")[2])) for line in open(sys.argv[5])])
-pageIdToTitle = {}
-for key, value in idMapping.iteritems():
-    if value in coords:
-        pageIdToTitle[value] = fixTitle(str(key))
+pageIdToTitle = {v: fixTitle(str(k)) for k, v in idMapping.iteritems()}
 outDir = sys.argv[6]
 threadIndex = int(sys.argv[7])
 coreCount = int(sys.argv[8])
 files = [urllib.unquote(file) for file in idMapping.values()]
-thisFiles = files[threadIndex * len(files) / coreCount : (threadIndex + 1) * len(files) / coreCount]
+thisFiles = files[threadIndex * len(files) / coreCount: (threadIndex + 1) * len(files) / coreCount]
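This slice hands each of the coreCount worker processes a contiguous, near-equal shard of the article list; with Python 2 integer division the shard boundaries meet exactly, so every file is processed once. An illustrative check for 10 files on 4 cores:

files = range(10)  # stand-ins for the unquoted file names
coreCount = 4
for threadIndex in range(coreCount):
    print threadIndex, files[threadIndex * len(files) / coreCount: (threadIndex + 1) * len(files) / coreCount]
# 0 [0, 1]
# 1 [2, 3, 4]
# 2 [5, 6]
# 3 [7, 8, 9]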
 imageSet = set()
 # preload coords
 if not os.path.exists(outDir):
     os.makedirs(outDir)
 for file in thisFiles:
     soup = BeautifulSoup(open(os.path.join(inDir, file)))
     soup = cleanUp(soup)
     rewriteImages(soup)
     rewriteCrossLinks(soup)
     # insert article "header" - image with breadcrumbs and map link
-    if file in coords:
-        articleTitle = pageIdToTitle[file]
-        insertMapLink(soup, coords[file][0], coords[file][1], articleTitle, file)
+    articleTitle = pageIdToTitle[file]
+    if file in coords:
+        insertMapLink(soup, coords[file][0], coords[file][1], articleTitle, file)
     insertArticleTitle(soup, articleTitle)
@@ -182,13 +203,25 @@ for file in thisFiles:
     grandParentLink = ancestors[file][2] + ".html" if ancestors[file][2] != "NULL" else False
     insertBreadcrumb(soup, articleTitle, parentTitle, parentLink, grandParentTitle, grandParentLink)
-    insertArticleImage(soup, "header_images/" + file + ".jpg")
+    articleImage = imageSanitizedPath(articleImages[file])
+    if articleImage:
+        insertArticleImage(soup, articleImage)
+    else:
+        print "article image not found:", articleImages[file]
     writeHtml(soup, file)
-imagesDstDir = os.path.join(outDir, "images")
+imagesDstDir = os.path.join(outDir, "images_fullsize")
 if not os.path.exists(imagesDstDir):
     os.makedirs(imagesDstDir)
 for image in imageSet:
     shutil.copy2(os.path.join(imagesSrcDir, imageFiles[image]), os.path.join(imagesDstDir, image))
+thumbsDstDir = os.path.join(outDir, "thumb_fullsize")
+if not os.path.exists(thumbsDstDir):
+    os.makedirs(thumbsDstDir)
+for k, v in articleImages.iteritems():
+    if k in thisFiles and sanitizeFileName(v) in imageFiles:
+        shutil.copy2(os.path.join(imagesSrcDir, imageFiles[sanitizeFileName(v)]), os.path.join(thumbsDstDir, k + ".jpg"))

View file

@@ -3,76 +3,81 @@ import sys
 import os
 import urllib
 import shutil
-from bs4 import BeautifulSoup
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    from BeautifulSoup import BeautifulSoup
 reload(sys)
 sys.setdefaultencoding('utf-8')
 def cleanUp(soup):
     content = soup.find("div", {"id": "content"})
     # remove all specified tags
     [s.decompose() for s in content(['noscript'])]
     [s.decompose() for s in content.findAll("a", {"id": "mw-mf-last-modified"})]
     [s.decompose() for s in content.findAll("span", {"class": "mw-editsection"})]
     [s.decompose() for s in content.findAll("table", {"class": "articleState"})]
     [s.decompose() for s in content.findAll("button", {"class": "languageSelector"})]
     [s.decompose() for s in content.findAll("a", {"class": "section_anchors"})]
     [s.decompose() for s in content.findAll("div", {"id": "mw-mf-language-section"})]
     # cut off geo coords, as we process them separately in the original files
     [s.decompose() for s in content.findAll("div", {"id": "geoCoord"})]
     # cut off missing images (they show up as the literal text File:Image.JPG on pages)
     for s in content.findAll("div", {"class": "thumb"}):
         if (not s.find("img")):
-            s.decompose();
+            s.decompose()
     # delete empty sections
     sections = content.findAll("div", {"class": "section"})
     for section in sections:
         hasText = 0
         for string in section.div.stripped_strings:
             hasText += 1
         if not hasText:
             section.decompose()
     # Wrap content with our own header and body, and restore the original div structure for css
     divContentWrapper = soup.new_tag("div", id="content_wrapper")
     divContentWrapper["class"] = "show"
     content = content.wrap(divContentWrapper)
     content = content.wrap(soup.new_tag("div", id="mw-mf-page-center"))
     content = content.wrap(soup.new_tag("div", id="mw-mf-viewport"))
     bodyTag = soup.new_tag("body")
     bodyTag["class"] = "mediawiki ltr sitedir-ltr mobile stable skin-mobile action-view"
     content = content.wrap(bodyTag)
     htmlTag = soup.new_tag("html", lang="en", dir="ltr")
     htmlTag["class"] = "client-js"
     content = content.wrap(htmlTag)
     # Here we add our own js and css into the <head>
     headTag = soup.new_tag("head")
     headTag.append(soup.new_tag("meta", charset="UTF-8"))
     headTag.append(soup.new_tag("link", rel="stylesheet", type="text/css", href="css/article.css"))
     headTag.append(soup.new_tag("script", type="text/javascript", src="js/article.js"))
     meta1 = soup.new_tag("meta", content="yes")
     # workaround: "name" is the tag-name argument of new_tag(), so set the attribute afterwards
     meta1["name"] = "apple-mobile-web-app-capable"
     headTag.append(meta1)
     meta2 = soup.new_tag("meta", content="initial-scale=1.0, user-scalable=yes, minimum-scale=0.25, maximum-scale=1.6")
     meta2["name"] = "viewport"
     headTag.append(meta2)
     content.body.insert_before(headTag)
     return content
 if __name__ == '__main__':
     if len(sys.argv) < 2:
         print "Usage: " + sys.argv[0] + " <inFile> [outFile]"
         exit(1)
     file = sys.argv[1]
     soup = BeautifulSoup(open(file))
     soup = cleanUp(soup)
     file = sys.stdout
     if len(sys.argv) > 2:
         file = open(sys.argv[2], 'w')
     file.write(soup.encode('utf-8'))

View file

@@ -22,7 +22,10 @@
 .articleImage {
   width: 100%;
-  height: auto;
+  height: 256px;
+  background-position: center center;
+  background-repeat: no-repeat;
+  background-size: cover;
 }
 .breadcrumbs_wrapper {

View file

@@ -52,6 +52,18 @@ function onPageLoaded() {
     [].forEach.call(sections, function(section) {
         addListener(section, 'click', onSectionClick);
     });
+    var img = new Image();
+    img.onload = function() {
+        // If the header image has to be upscaled to fill its container,
+        // soften the pixelation with a blur whose radius equals the upscale
+        // factor, and pull the margins in to crop the blurred fringe.
+        if (document.getElementById('articleImage').offsetWidth > this.width) {
+            var rad = document.getElementById('articleImage').offsetWidth / this.width;
+            document.getElementById('articleImage').style.webkitFilter = "blur(" + rad + "px)";
+            rad *= 2;
+            document.getElementById('articleImage').style.margin = "-" + rad + "px -" + rad + "px -" + rad + "px -" + rad + "px";
+        }
+    };
+    // extract the bare URL from the inline background-image style
+    img.src = document.getElementById('articleImage').style.backgroundImage.replace(/url\((['"])?(.*?)\1\)/gi, '$2').split(',')[0];
 }
 window.onload = onPageLoaded;
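For a sense of the numbers: a 256px-wide thumbnail stretched across a 640px-wide header gives rad = 640 / 256 = 2.5, so the element gets blur(2.5px) with -5px margins on all four sides to crop the blurred fringe; an image at least as wide as its container is left untouched. The replace() regex merely strips the url('...') wrapper that insertArticleImage wrote into the inline style, leaving the bare path for img.src.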