Improvements in stripping and making.
This commit is contained in:
parent
da61751b9a
commit
ac1b466eb3
3 changed files with 12 additions and 6 deletions
|
@ -1,4 +1,4 @@
|
|||
GWMvc=6
|
||||
GWMvc=7
|
||||
GWMvn=1.2
|
||||
GWMpn=com.guidewithme.any
|
||||
GWMapk=Any_Guide_With_Me
|
||||
|
|
|
@ -40,10 +40,16 @@ def cleanUp(soup):
|
|||
sections = content.findAll("h2")
|
||||
for section in sections:
|
||||
content_div = section.findNextSibling("div")
|
||||
if not content_div.text.strip():
|
||||
print section.text, " : is empty"
|
||||
content_div.decompose()
|
||||
section.decompose()
|
||||
try:
|
||||
if not content_div.text.strip():
|
||||
content_div.decompose()
|
||||
section.decompose()
|
||||
except:
|
||||
print "error in :", content_div
|
||||
|
||||
# remove all no-print
|
||||
[tag.decompose for tag in content.findAll(attrs={"class":"noprint"})]
|
||||
|
||||
|
||||
# Wrap content with our own header and body, and restore original div structure for css
|
||||
divContentWrapper = soup.new_tag("div", id="content_wrapper")
|
||||
|
|
|
@ -81,7 +81,7 @@ geocodes.txt: geocodes_from_html.txt geocodes_todo.txt
|
|||
touch geocodes.txt
|
||||
|
||||
process_html: clean_up_countries geocodes.txt
|
||||
cat countries_to_generate.txt | while read country; do mkdir -p Countries/$$country/content/data; ../htmlprocessor/processor.sh articles/ images/ $$country.info.txt $$country.redirect.txt geocodes.txt Countries/$$country/content/data; done
|
||||
cat countries_to_generate.txt | while read country; do mkdir -p Countries/$$country/content/data; rm Countries/$$country/content/data/*.html; ../htmlprocessor/processor.sh articles/ images/ $$country.info.txt $$country.redirect.txt geocodes.txt Countries/$$country/content/data; done
|
||||
touch process_html
|
||||
|
||||
genindex: geocodes.txt clean_up_countries
|
||||
|
|
Reference in a new issue