# Gets clean content from raw html
#
# Origin: builder/htmlprocessor/strip.py, introduced by commit
# 2cba29d8e15aa748de9f38bbf0f273123afe7166 ("Added html processor",
# Alexander Zolotarev, Tue, 6 Aug 2013 17:23:03 +0300).
#
# Usage: strip.py <input html file> [optional output file]
#
# Reads an HTML page, keeps only the <div id="content"> subtree, removes a
# fixed set of unwanted elements from it and emits the prettified result as
# UTF-8, either to the optional output file or to stdout.

import sys


def _extract_all(tags):
    # Detach every tag in `tags` from the tree it belongs to.
    for tag in tags:
        tag.extract()


def _strip_unwanted(content):
    # Remove, in place, the elements that must not appear in the output.
    # remove all specified tags
    _extract_all(content(['noscript']))

    # find() returns None when the anchor is absent; skip it instead of
    # failing with AttributeError on pages that do not carry it.
    last_modified = content.find("a", {"id": "mw-mf-last-modified"})
    if last_modified is not None:
        last_modified.extract()

    _extract_all(content.findAll("span", {"class": "mw-editsection"}))
    _extract_all(content.findAll("table", {"class": "articleState"}))


def main(argv=None):
    """Run the stripper and return the process exit status.

    argv follows the sys.argv layout: [program, input html file,
    optional output file]. When omitted, sys.argv is used.

    Returns 0 on success and 1 when the input argument is missing or the
    page has no <div id="content"> element.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        program = argv[0] if argv else "strip.py"
        print("Usage: " + program + " <input html file> [optional output file]")
        return 1

    # Imported after argument validation, as in the original script, so the
    # usage path above does not depend on bs4.
    from bs4 import BeautifulSoup

    # Binary mode hands raw bytes to BeautifulSoup, which then detects the
    # document encoding itself; `with` guarantees the handle is closed.
    with open(argv[1], "rb") as html_file:
        soup = BeautifulSoup(html_file)

    content = soup.find("div", {"id": "content"})
    if content is None:
        sys.stderr.write("Error: no <div id=\"content\"> found in " + argv[1] + "\n")
        return 1

    _strip_unwanted(content)

    data = content.prettify().encode('utf-8')

    if len(argv) == 3:
        with open(argv[2], "wb") as out_file:
            out_file.write(data)
    else:
        # Write bytes directly: Python 3 exposes the binary stream as
        # sys.stdout.buffer, Python 2's sys.stdout already accepts bytes.
        # The trailing newline matches what `print` used to append.
        stream = getattr(sys.stdout, "buffer", sys.stdout)
        stream.write(data + b"\n")

    return 0


if __name__ == "__main__":
    sys.exit(main())