Merge pull request #4 from Komzpa/generator

Generator
deathbaba 2013-08-13 08:44:34 -07:00
commit 0d4b1ba779
5 changed files with 254 additions and 176 deletions

View file

@@ -0,0 +1,25 @@
+#!/bin/bash
+# Generate thumbnails and downscaled images with ImageMagick.
+# Note: outDir should be an absolute path, since the script cd's into the input dirs.
+if test "$3" == "" ; then
+  echo "USAGE: $0 [thumbsInDir] [imagesInDir] [outDir]"
+  exit 1
+fi
+outdir=$3
+mkdir -p "$outdir"
+mkdir -p "$outdir/thumb"
+pushd "$1"
+for i in *.png; do convert "$i" -auto-orient -quality 53 -thumbnail '256x256>' "$outdir/thumb/$(basename -s .png "$i").jpg"; echo "$i"; done
+for i in *.svg; do convert "$i" -auto-orient -quality 53 -thumbnail '256x256>' "$outdir/thumb/$(basename -s .svg "$i").jpg"; echo "$i"; done
+for i in *.jpg *.JPG *.jpeg; do convert -define jpeg:size=400x280 "$i" -auto-orient -quality 53 -thumbnail '500x280>' -strip -liquid-rescale '256x256!>' "$outdir/thumb/$i"; echo "$i"; done
+popd
+mkdir -p "$outdir/images"
+pushd "$2"
+for i in *.jpg; do convert "$i" -auto-orient -quality 53 -strip -thumbnail '1536x1536>' "$outdir/images/$i"; echo "$i"; done
+for i in *.png; do convert "$i" -auto-orient -quality 99 -strip -thumbnail '4000x3000>' "PNG8:$outdir/images/$i"; echo "$i"; done
+cp *.svg "$outdir/images/"
+popd
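The '256x256>' geometry shrinks only images larger than 256px on a side (the '>' suffix never enlarges), and -liquid-rescale '256x256!>' seam-carves photos to an exact 256x256 square instead of letterboxing them. A rough Python equivalent of the *.png pass, assuming ImageMagick's convert is on the PATH (function and directory names here are illustrative):

import glob
import os
import subprocess

def makePngThumbs(thumbsInDir, outDir):
    # mirror the *.png loop above: shrink-only thumbnail, JPEG quality 53
    for path in glob.glob(os.path.join(thumbsInDir, "*.png")):
        name = os.path.splitext(os.path.basename(path))[0]
        subprocess.call(["convert", path, "-auto-orient", "-quality", "53",
                         "-thumbnail", "256x256>",
                         os.path.join(outDir, "thumb", name + ".jpg")])
        print path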

View file

@@ -4,7 +4,10 @@ import os
 import urllib
 import shutil
 import unicodedata
-from bs4 import BeautifulSoup
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    from BeautifulSoup import BeautifulSoup
 reload(sys)
 sys.setdefaultencoding('utf-8')
@@ -13,166 +16,184 @@ from strip_function import cleanUp
 def insertMapLink(soup, lat, lon, title, pageId):
-    hrefLink = "mapswithme://map?v=1&ll=%s,%s&n=%s&id=%s&backurl=guideswithme&appname=Guides%%20With%%20Me"%(lat, lon, urllib.quote(title), pageId)
+    hrefLink = "mapswithme://map?v=1&ll=%s,%s&n=%s&id=%s&backurl=guideswithme&appname=Guides%%20With%%20Me" % (lat, lon, urllib.quote(title), pageId)
     mapTag = BeautifulSoup().new_tag("a", href=hrefLink)
-    mapTag["class"] = "geolink";
+    mapTag["class"] = "geolink"
     soup.body.insert(0, mapTag)
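For illustrative values lat=59.93, lon=30.31, title "Saint Petersburg" and pageId 42, the format string above expands to (urllib.quote turns the space into %20, and the doubled %% survives as a literal %20 in the app name):

mapswithme://map?v=1&ll=59.93,30.31&n=Saint%20Petersburg&id=42&backurl=guideswithme&appname=Guides%20With%20Me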
 def insertArticleTitle(soup, articleTitle):
     titleTag = BeautifulSoup().new_tag("div")
     titleTag["class"] = "articleHeader"
     titleTag.append(articleTitle)
     soup.body.insert(0, titleTag)
 def insertArticleImage(soup, imagePath):
-    imgTag = BeautifulSoup().new_tag("img", src=imagePath)
+    imgTag = BeautifulSoup().new_tag("img", style="background-image: url('%s')" % imagePath, id="articleImage")
     imgTag["class"] = "articleImage"
     soup.body.insert(0, imgTag)
 def insertBreadcrumb(soup, articleTitle, parentTitle, parentLink, grandParentTitle, grandParentLink):
     tagFactory = BeautifulSoup()
     bcWrapper = tagFactory.new_tag("div")
     bcWrapper["class"] = "breadcrumbs_wrapper"
     if (grandParentTitle):
         grandParentTag = tagFactory.new_tag("a", href=grandParentLink)
         grandParentTag["class"] = "breadcrumb bc1"
         grandParentTag.append(grandParentTitle)
         bcWrapper.append(grandParentTag)
     if (parentTitle):
         parentTag = tagFactory.new_tag("a", href=parentLink)
         parentTag["class"] = "breadcrumb bc2"
         parentTag.append(parentTitle)
         bcWrapper.append(parentTag)
     currTag = tagFactory.new_tag("span")
     currTag["class"] = "breadcrumb bc3"
     currTag.append(articleTitle)
     bcWrapper.append(currTag)
     soup.body.insert(0, bcWrapper)
-def transformStringWithEncoding(str):
-    return urllib.unquote(str.decode("latin-1").encode("utf-8"))
+def transformStringWithEncoding(string):
+    return urllib.unquote(string.decode("latin-1").encode("utf-8"))
-def transformString(s):
-    unquoted = urllib.unquote(str(s));
-    for i in u"\"',/\\@#$%^&*()!~`«»":
-        unquoted = unquoted.replace(i, "_")
-    return unicode(unquoted.strip("_"))
-def formatToNFKD(s):
-    return unicodedata.normalize("NFKD", transformString(s))
-def unicodeNormalize(s):
-    return (u"".join( x for x in formatToNFKD(s) if not unicodedata.category(x).startswith("M"))).lower()
-def imageExist(fileName):
-    global imageFiles
-    global imageSet
-    unquotedName = unicodeNormalize(fileName)
-    if unquotedName in imageFiles:
-        imageSet.add(unquotedName)
-        return True
-    return False
+def sanitizeFileName(s):
+    # unquote %20 and the like
+    s = urllib.unquote(str(s))
+    # remove punctuation
+    for i in u"\"',/\\@#$%^&*()!~`«»":
+        s = s.replace(i, "_")
+    s = s.strip("_")
+    # normalize unicode to NFKD (form with modifiers separated)
+    s = unicodedata.normalize("NFKD", s)
+    # drop modifiers from the string (no diacritics)
+    s = u"".join(x for x in s if not unicodedata.category(x).startswith("M"))
+    # lowercase
+    s = s.lower()
+    return s
+def imageSanitizedPath(fileName):
+    """
+    Return the path to an image file if it's in the dataset, None otherwise.
+    """
+    global imageFiles
+    global imageSet
+    unquotedName = sanitizeFileName(fileName)
+    if unquotedName in imageFiles:
+        imageSet.add(unquotedName)
+        return 'images/' + unquotedName
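An illustrative walk through the sanitizer with a typical wiki image name (each comment shows the state after that step):

sanitizeFileName("Caf%C3%A9_de_Flore.JPG")
# urllib.unquote   -> "Café_de_Flore.JPG"
# punctuation pass -> unchanged ("_" and "." are not in the replaced set)
# NFKD + drop marks -> "Cafe_de_Flore.JPG"
# lower()          -> "cafe_de_flore.jpg"

Since the keys of imageFiles are built with the same function, any two spellings that normalize alike will match.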
 def rewriteImages(soup):
-    imgTag = soup.findAll("img");
+    imgTag = soup.findAll("img")
     for imgElement in imgTag:
         del imgElement["alt"]
-        #todo rewrite srcset attr if we can get callback on image loading in webview
+        # todo: rewrite the srcset attr if we can get a callback on image loading in the webview
         del imgElement["srcset"]
         index = -1
-        srcPath = imgElement["src"]
-        splitedSrc = srcPath.split("/")
-        if imageExist(splitedSrc[-1]):
-            imgElement['src'] = "images/" + unicodeNormalize(splitedSrc[-1])
-        elif imageExist(splitedSrc[-2]):
-            imgElement['src'] = "images/" + unicodeNormalize(splitedSrc[-2])
-        else:
-            print "Image strip = " + unicodeNormalize(splitedSrc[-2])
-            [s.decompose() for s in imgElement.fetchParents("div", {"class" : ["thumb tright", "thumbinner", "image"]})]
+        splitSrc = imgElement["src"].split("/")
+        splitSrc.reverse()
+        # check just the last two path elements (preview name, real name)
+        for fileName in splitSrc[:2]:
+            fileName = imageSanitizedPath(fileName)
+            if fileName:
+                imgElement["src"] = fileName
+                break
+        else:
+            print "Stripping image", imgElement["src"]
+            [s.decompose() for s in imgElement.fetchParents("div", {"class": ["thumb tright", "thumbinner", "image"]})]
 def rewriteCrossLinks(soup):
     global idMapping
     global redirectMapping
     links = soup.findAll("a")
     for link in links:
-        destTitle = link["href"].split("/",2)[-1]
+        destTitle = link["href"].split("/", 2)[-1]
         destTitle = transformStringWithEncoding(destTitle)
-        destTitle = redirectMapping.get(destTitle, destTitle);
+        destTitle = redirectMapping.get(destTitle, destTitle)
         if destTitle in idMapping:
             link["href"] = idMapping.get(destTitle, link["href"]) + ".html"
             continue
         if "/wiki/File:" in link["href"] and "http" not in link["href"] and "www" not in link["href"]:
             imgElement = link.find("img")
             if imgElement:
                 link["href"] = imgElement["src"]
             continue
         if "/wiki/" in link["href"]:
             if link.string:
                 link.replace_with(link.string)
             else:
                 link.replace_with("")
 def writeHtml(content, fileName):
     global outDir
     open(os.path.join(outDir, fileName + ".html"), "w").write(content.encode('utf-8'))
 def fixTitle(title):
     return title.split('/')[-1].replace('_', ' ')
 ##############################################################################
 if len(sys.argv) < 9:
     print "Usage: " + sys.argv[0] + " <directory with html articles> <images directory> <article set info file> <redirect info file> <geocoords file> <output directory> <threadIndex> <cpu core count>"
     exit(1)
 inDir = sys.argv[1]
 imagesSrcDir = sys.argv[2]
-imageFiles = dict([(unicodeNormalize(file), file) for file in os.listdir(imagesSrcDir)])
-idMapping = dict([(unicode(i.split("\t")[1]), unicode(i.split("\t")[0])) for i in open(sys.argv[3])])
+imageFiles = dict([(sanitizeFileName(file), file) for file in os.listdir(imagesSrcDir)])
+idMappingFile = sys.argv[3]
+idMapping = dict([(unicode(i.split("\t")[1]), unicode(i.split("\t")[0])) for i in open(idMappingFile)])
+articleImages = dict([(i.split("\t")[0], i.strip().split("\t")[3]) for i in open(sys.argv[3])])
 # pageId => [parentId, parentTitle, grandParentId, grandParentTitle], ids and titles can be "NULL"
 ancestors = dict([(i.split("\t")[0], i.strip().split("\t")[4:8]) for i in open(sys.argv[3])])
 redirectMapping = dict([(unicode(line.split("\t")[1]), unicode(line.split("\t")[3].strip())) for line in open(sys.argv[4])])
 coords = dict([(line.split("\t")[0], (line.split("\t")[1], line.split("\t")[2])) for line in open(sys.argv[5])])
-pageIdToTitle = {}
-for key, value in idMapping.iteritems():
-    if value in coords:
-        pageIdToTitle[value] = fixTitle(str(key))
+pageIdToTitle = {v: fixTitle(str(k)) for k, v in idMapping.iteritems()}
 outDir = sys.argv[6]
 threadIndex = int(sys.argv[7])
 coreCount = int(sys.argv[8])
 files = [urllib.unquote(file) for file in idMapping.values()]
-thisFiles = files[threadIndex * len(files) / coreCount : (threadIndex + 1) * len(files) / coreCount]
+thisFiles = files[threadIndex * len(files) / coreCount: (threadIndex + 1) * len(files) / coreCount]
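This slice hands each of the coreCount worker processes a contiguous, near-equal shard of the article list; with Python 2 integer division the shard boundaries meet exactly, so every file is processed once. An illustrative check for 10 files on 4 cores:

files = range(10)  # stand-ins for the unquoted file names
coreCount = 4
for threadIndex in range(coreCount):
    print threadIndex, files[threadIndex * len(files) / coreCount: (threadIndex + 1) * len(files) / coreCount]
# 0 [0, 1]
# 1 [2, 3, 4]
# 2 [5, 6]
# 3 [7, 8, 9]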
 imageSet = set()
 # preload coords
 if not os.path.exists(outDir):
     os.makedirs(outDir)
 for file in thisFiles:
     soup = BeautifulSoup(open(os.path.join(inDir, file)))
     soup = cleanUp(soup)
     rewriteImages(soup)
     rewriteCrossLinks(soup)
     # insert article "header" - image with breadcrumbs and map link
-    if file in coords:
-        articleTitle = pageIdToTitle[file]
-        insertMapLink(soup, coords[file][0], coords[file][1], articleTitle, file)
+    articleTitle = pageIdToTitle[file]
+    if file in coords:
+        insertMapLink(soup, coords[file][0], coords[file][1], articleTitle, file)
     insertArticleTitle(soup, articleTitle)
@@ -182,13 +203,25 @@ for file in thisFiles:
     grandParentLink = ancestors[file][2] + ".html" if ancestors[file][2] != "NULL" else False
     insertBreadcrumb(soup, articleTitle, parentTitle, parentLink, grandParentTitle, grandParentLink)
-    insertArticleImage(soup, "header_images/" + file + ".jpg")
+    articleImage = imageSanitizedPath(articleImages[file])
+    if articleImage:
+        insertArticleImage(soup, articleImage)
+    else:
+        print "article image not found:", articleImages[file]
     writeHtml(soup, file)
-imagesDstDir = os.path.join(outDir, "images")
+imagesDstDir = os.path.join(outDir, "images_fullsize")
 if not os.path.exists(imagesDstDir):
     os.makedirs(imagesDstDir)
 for image in imageSet:
     shutil.copy2(os.path.join(imagesSrcDir, imageFiles[image]), os.path.join(imagesDstDir, image))
+thumbsDstDir = os.path.join(outDir, "thumb_fullsize")
+if not os.path.exists(thumbsDstDir):
+    os.makedirs(thumbsDstDir)
+for k, v in articleImages.iteritems():
+    if k in thisFiles and sanitizeFileName(v) in imageFiles:
+        shutil.copy2(os.path.join(imagesSrcDir, imageFiles[sanitizeFileName(v)]), os.path.join(thumbsDstDir, k + ".jpg"))

View file

@@ -3,76 +3,81 @@ import sys
 import os
 import urllib
 import shutil
-from bs4 import BeautifulSoup
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    from BeautifulSoup import BeautifulSoup
 reload(sys)
 sys.setdefaultencoding('utf-8')
 def cleanUp(soup):
     content = soup.find("div", {"id": "content"})
     # remove all specified tags
     [s.decompose() for s in content(['noscript'])]
     [s.decompose() for s in content.findAll("a", {"id": "mw-mf-last-modified"})]
     [s.decompose() for s in content.findAll("span", {"class": "mw-editsection"})]
     [s.decompose() for s in content.findAll("table", {"class": "articleState"})]
     [s.decompose() for s in content.findAll("button", {"class": "languageSelector"})]
     [s.decompose() for s in content.findAll("a", {"class": "section_anchors"})]
     [s.decompose() for s in content.findAll("div", {"id": "mw-mf-language-section"})]
     # cut off geo coords, as we process them separately in the original files
     [s.decompose() for s in content.findAll("div", {"id": "geoCoord"})]
     # cut off missing images (they show up as the literal text File:Image.JPG on pages)
     for s in content.findAll("div", {"class": "thumb"}):
         if (not s.find("img")):
-            s.decompose();
+            s.decompose()
     # delete empty sections
     sections = content.findAll("div", {"class": "section"})
     for section in sections:
         hasText = 0
         for string in section.div.stripped_strings:
             hasText += 1
         if not hasText:
             section.decompose()
     # Wrap content with our own header and body, and restore the original div structure for css
     divContentWrapper = soup.new_tag("div", id="content_wrapper")
     divContentWrapper["class"] = "show"
     content = content.wrap(divContentWrapper)
     content = content.wrap(soup.new_tag("div", id="mw-mf-page-center"))
     content = content.wrap(soup.new_tag("div", id="mw-mf-viewport"))
     bodyTag = soup.new_tag("body")
     bodyTag["class"] = "mediawiki ltr sitedir-ltr mobile stable skin-mobile action-view"
     content = content.wrap(bodyTag)
     htmlTag = soup.new_tag("html", lang="en", dir="ltr")
     htmlTag["class"] = "client-js"
     content = content.wrap(htmlTag)
     # Here we add our own js and css into the <head>
     headTag = soup.new_tag("head")
     headTag.append(soup.new_tag("meta", charset="UTF-8"))
     headTag.append(soup.new_tag("link", rel="stylesheet", type="text/css", href="css/article.css"))
     headTag.append(soup.new_tag("script", type="text/javascript", src="js/article.js"))
     meta1 = soup.new_tag("meta", content="yes")
     # workaround: "name" is the tag-name argument of new_tag(), so set the attribute afterwards
     meta1["name"] = "apple-mobile-web-app-capable"
     headTag.append(meta1)
     meta2 = soup.new_tag("meta", content="initial-scale=1.0, user-scalable=yes, minimum-scale=0.25, maximum-scale=1.6")
     meta2["name"] = "viewport"
     headTag.append(meta2)
     content.body.insert_before(headTag)
     return content
 if __name__ == '__main__':
     if len(sys.argv) < 2:
         print "Usage: " + sys.argv[0] + " <inFile> [outFile]"
         exit(1)
     file = sys.argv[1]
     soup = BeautifulSoup(open(file))
     soup = cleanUp(soup)
     file = sys.stdout
     if len(sys.argv) > 2:
         file = open(sys.argv[2], 'w')
     file.write(soup.encode('utf-8'))

View file

@@ -22,7 +22,10 @@
 .articleImage {
   width: 100%;
-  height: auto;
+  height: 256px;
+  background-position: center center;
+  background-repeat: no-repeat;
+  background-size: cover;
 }
 .breadcrumbs_wrapper {

View file

@@ -52,6 +52,18 @@ function onPageLoaded() {
     [].forEach.call(sections, function(section) {
         addListener(section, 'click', onSectionClick);
     });
+    var img = new Image();
+    img.onload = function() {
+        // If the header image has to be upscaled to fill its container,
+        // soften the pixelation with a blur whose radius equals the upscale
+        // factor, and pull the margins in to crop the blurred fringe.
+        if (document.getElementById('articleImage').offsetWidth > this.width) {
+            var rad = document.getElementById('articleImage').offsetWidth / this.width;
+            document.getElementById('articleImage').style.webkitFilter = "blur(" + rad + "px)";
+            rad *= 2;
+            document.getElementById('articleImage').style.margin = "-" + rad + "px -" + rad + "px -" + rad + "px -" + rad + "px";
+        }
+    };
+    // extract the bare URL from the inline background-image style
+    img.src = document.getElementById('articleImage').style.backgroundImage.replace(/url\((['"])?(.*?)\1\)/gi, '$2').split(',')[0];
 }
 window.onload = onPageLoaded;
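For a sense of the numbers: a 256px-wide thumbnail stretched across a 640px-wide header gives rad = 640 / 256 = 2.5, so the element gets blur(2.5px) with -5px margins on all four sides to crop the blurred fringe; an image at least as wide as its container is left untouched. The replace() regex merely strips the url('...') wrapper that insertArticleImage wrote into the inline style, leaving the bare path for img.src.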