Improvements in stripping and making.

This commit is contained in:
Dmitry Kunin 2013-10-16 18:18:23 +02:00
parent da61751b9a
commit ac1b466eb3
3 changed files with 12 additions and 6 deletions

View file

@ -1,4 +1,4 @@
GWMvc=6
GWMvc=7
GWMvn=1.2
GWMpn=com.guidewithme.any
GWMapk=Any_Guide_With_Me

View file

@ -40,10 +40,16 @@ def cleanUp(soup):
sections = content.findAll("h2")
for section in sections:
content_div = section.findNextSibling("div")
if not content_div.text.strip():
print section.text, " : is empty"
content_div.decompose()
section.decompose()
try:
if not content_div.text.strip():
content_div.decompose()
section.decompose()
except:
print "error in :", content_div
# remove all no-print
[tag.decompose for tag in content.findAll(attrs={"class":"noprint"})]
# Wrap content with our own header and body, and restore original div structure for css
divContentWrapper = soup.new_tag("div", id="content_wrapper")

View file

@ -81,7 +81,7 @@ geocodes.txt: geocodes_from_html.txt geocodes_todo.txt
touch geocodes.txt
process_html: clean_up_countries geocodes.txt
cat countries_to_generate.txt | while read country; do mkdir -p Countries/$$country/content/data; ../htmlprocessor/processor.sh articles/ images/ $$country.info.txt $$country.redirect.txt geocodes.txt Countries/$$country/content/data; done
cat countries_to_generate.txt | while read country; do mkdir -p Countries/$$country/content/data; rm Countries/$$country/content/data/*.html; ../htmlprocessor/processor.sh articles/ images/ $$country.info.txt $$country.redirect.txt geocodes.txt Countries/$$country/content/data; done
touch process_html
genindex: geocodes.txt clean_up_countries