image links rewrite

ExMix 2013-08-07 16:19:01 +03:00
parent 9c7f09ce6b
commit 5ed265561e
2 changed files with 82 additions and 36 deletions


@@ -0,0 +1,5 @@
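# run strip.py over each of the 8 slices of the article set, one slice per invocation
# usage: <this script> <directory with html articles> <images directory> <output directory>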
for interval in 0 1 2 3 4 5 6 7;
do python strip.py $1 $2 $3 $interval 8
done


@@ -1,46 +1,87 @@
import sys
import os
import urllib
reload(sys)
sys.setdefaultencoding('utf-8')
from bs4 import BeautifulSoup
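# Strip the MediaWiki mobile page chrome (edit links, language selector, article
# state tables, etc.) from the parsed article and wrap what remains in a minimal
# <html>/<head>/<body> skeleton that references our own article.css and article.js.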
def cleanUp(soup):
    content = soup.find("div", {"id": "content"})
    # remove all specified tags
    [s.decompose() for s in content(['noscript'])]
[s.decompose() for s in content.findAll("a", {"id": "mw-mf-last-modified"})]
[s.decompose() for s in content.findAll("span", {"class": "mw-editsection"})]
[s.decompose() for s in content.findAll("table", {"class": "articleState"})]
[s.decompose() for s in content.findAll("button", {"class": "languageSelector"})]
[s.decompose() for s in content.findAll("a", {"class": "section_anchors"})]
[s.decompose() for s in content.findAll("div", {"id": "mw-mf-language-section"})]
[s.decompose() for s in content.findAll("a", {"id": "mw-mf-last-modified"})]
[s.decompose() for s in content.findAll("span", {"class": "mw-editsection"})]
[s.decompose() for s in content.findAll("table", {"class": "articleState"})]
[s.decompose() for s in content.findAll("button", {"class": "languageSelector"})]
[s.decompose() for s in content.findAll("a", {"class": "section_anchors"})]
[s.decompose() for s in content.findAll("div", {"id": "mw-mf-language-section"})]
    # Wrap content with our own header and body
    content = content.wrap(soup.new_tag("body"))
    content = content.wrap(soup.new_tag("html"))
    # Here we add our own js and css into the <head>
    headTag = soup.new_tag("head")
    cType = soup.new_tag("meta", content="text/html; charset=UTF-8")
    # workaround: keyword arguments can't contain dashes, so set http-equiv as an attribute
    cType["http-equiv"] = "Content-Type"
    headTag.append(cType)
    headTag.append(soup.new_tag("link", rel="stylesheet", type="text/css", href="article.css"))
    headTag.append(soup.new_tag("script", type="text/javascript", href="article.js"))
    meta1 = soup.new_tag("meta", content="yes")
    # workaround: "name" is the tag-name argument of new_tag(), so set it as an attribute
    meta1["name"] = "apple-mobile-web-app-capable"
    headTag.append(meta1)
    meta2 = soup.new_tag("meta", content="initial-scale=1.0, user-scalable=yes, minimum-scale=0.25, maximum-scale=1.6")
    meta2["name"] = "viewport"
    headTag.append(meta2)
    content.body.insert_before(headTag)
    return content
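# Check whether an image referenced by the article was actually mirrored into the
# images directory (file names are compared URL-unquoted and lower-cased).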
def imageExist(fileName):
    global imageFiles
    return urllib.unquote(fileName).lower() in imageFiles
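# Rewrite <img> links to point at the local images/ directory; for thumbnail URLs the
# original file name sits one path segment higher. Images that were not mirrored are
# removed together with their enclosing "thumb tright" container.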
def rewriteImages(soup):
    imgTags = soup.findAll("img")
    for imgElement in imgTags:
        del imgElement['alt']
        # todo: rewrite the srcset attr if we can get a callback on image loading in the webview
        del imgElement['srcset']
        index = -1
        if "thumb" in imgElement['src'] and not '.pdf' in imgElement['src'].split("/")[-2]:
            index = -2
        imageName = imgElement['src'].split("/")[index]
        if imageExist(imageName):
            imgElement['src'] = "images/" + imageName
        else:
            [s.decompose() for s in imgElement.fetchParents("div", {"class" : "thumb tright"})]
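# Write the processed article into the output directory under its original file name.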
def writeHtml(content, fileName):
    global outDir
    open(os.path.join(outDir, fileName), "w").write(content.prettify().encode('utf-8'))
##############################################################################
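# Driver: the wrapper script above invokes this file once per slice; each invocation
# processes its own contiguous slice of the input files, selected by <threadIndex>
# out of <cpu core count> slices.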
if len(sys.argv) < 6:
    print "Usage: " + sys.argv[0] + " <directory with html articles> <images directory> <output directory> <threadIndex> <cpu core count>"
    exit(1)
inDir = sys.argv[1]
imageFiles = [unicode((urllib.unquote(file)).lower()) for file in os.listdir(sys.argv[2])]
outDir = sys.argv[3]
threadIndex = int(sys.argv[4])
coreCount = int(sys.argv[5])
files = [urllib.unquote(file) for file in os.listdir(sys.argv[1])]
thisFiles = files[threadIndex * len(files) / coreCount : (threadIndex + 1) * len(files) / coreCount]
for file in thisFiles:
    soup = BeautifulSoup(open(os.path.join(inDir, file)))
    soup = cleanUp(soup)
    rewriteImages(soup)
    writeHtml(soup, file)