[generator] Added optional langs and beautify_page

Maksim Andrianov 2018-12-05 14:26:41 +03:00 committed by mpimenov
parent 2bc0ca8a69
commit db39821893
2 changed files with 103 additions and 23 deletions

View file

@@ -1,10 +1,14 @@
import os
import re
import argparse
+import functools
import logging
+import itertools
import urllib.parse
import wikipediaapi
+import htmlmin
from multiprocessing.pool import ThreadPool
+from bs4 import BeautifulSoup
"""
This script downloads Wikipedia pages for different languages.
"""
@@ -13,8 +17,62 @@ log = logging.getLogger(__name__)
WORKERS = 16
CHUNK_SIZE = 64

+HEADERS = {f"h{x}" for x in range(1,7)}
+
+BAD_SECTIONS = {
+    "en": ["External links", "Sources", "See also", "Bibliography", "Further reading"],
+    "ru": ["Литература", "Ссылки", "См. также"],
+    "es": ["Vínculos de interés", "Véase también", "Enlaces externos"]
+}
+
+
+def remove_bad_sections(soup, lang):
+    if lang not in BAD_SECTIONS:
+        return soup
+
+    it = iter(soup.find_all())
+    current = next(it, None)
+    while current is not None:
+        if current.name in HEADERS and current.text.strip() in BAD_SECTIONS[lang]:
+            current.extract()
+            current = next(it, None)
+            while current is not None:
+                if current.name in HEADERS:
+                    break
+                current.extract()
+                current = next(it, None)
+        else:
+            current = next(it, None)
+    return soup
+
+
+def remove_empty_sections(soup):
+    prev = None
+    for x in soup.find_all():
+        if prev is not None and x.name in HEADERS and prev.name in HEADERS:
+            prev.extract()
+        prev = x
+    return soup
+
+
+def beautify_page(html, lang):
+    soup = BeautifulSoup(html, "html")
+    for x in soup.find_all():
+        if len(x.text.strip()) == 0:
+            x.extract()
+
+    soup = remove_empty_sections(soup)
+    soup = remove_bad_sections(soup, lang)
+    html = str(soup.prettify())
+    html = htmlmin.minify(html, remove_empty_space=True)
+    return html
+
+
+def need_lang(lang, langs):
+    return lang in langs if langs else True
+
+
-def download(dir, url):
+def download(directory, url):
    url = urllib.parse.unquote(url)
    parsed = urllib.parse.urlparse(url)
    try:
@@ -22,7 +80,7 @@ def download(dir, url):
    except (AttributeError, IndexError):
        log.exception(f"{parsed.netloc} is incorrect.")
        return None
-    path = os.path.join(dir, f"{lang}.html")
+    path = os.path.join(directory, f"{lang}.html")
    if os.path.exists(path):
        log.warning(f"{path} already exists.")
        return None
@@ -37,34 +95,50 @@ def download(dir, url):
    text = page.text
    page_size = len(text)
    if page_size:
+        references = "<h2>References</h2>"
+        index = text.find(references)
+        if index >= 0:
+            text = text[:index] + text[index + len(references):]
+        text = beautify_page(text, lang)
        log.info(f"Save to {path} {lang} {page_name} {page_size}.")
+        os.makedirs(directory, exist_ok=True)
        with open(path, "w") as file:
            file.write(text)
    else:
        log.warning(f"Page {url} is empty. It has not been saved.")
-    return page
+    return text


-def download_all(path, url):
-    page = download(path, url)
-    if page is None:
-        return
+def get_wiki_langs(url):
+    url = urllib.parse.unquote(url)
+    parsed = urllib.parse.urlparse(url)
    try:
-        lang_links = page.langlinks
+        lang = parsed.netloc.split(".", maxsplit=1)[0]
+    except (AttributeError, IndexError):
+        log.exception(f"{parsed.netloc} is incorrect.")
+        return None
+    wiki = wikipediaapi.Wikipedia(language=lang,
+                                  extract_format=wikipediaapi.ExtractFormat.HTML)
+    try:
+        page_name = parsed.path.rsplit("/", maxsplit=1)[-1]
+    except (AttributeError, IndexError):
+        log.exception(f"{parsed.path} is incorrect.")
+        return None
+    page = wiki.page(page_name)
+    my_lang = [(lang, url), ]
+    try:
+        langlinks = page.langlinks
+        return list(zip(langlinks.keys(),
+                        [link.fullurl for link in langlinks.values()])) + my_lang
    except KeyError as e:
        log.warning(f"No languages for {url} ({e}).")
-        return
-    for link in lang_links.values():
-        download(path, link.fullurl)
+    return my_lang
+
+
+def download_all(path, url, langs):
+    available_langs = get_wiki_langs(url)
+    available_langs = filter(lambda x: need_lang(x[0], langs), available_langs)
+    for lang in available_langs:
+        download(path, lang[1])


-def worker(output_dir):
+def worker(output_dir, langs):
+    @functools.wraps(worker)
    def wrapped(line):
        try:
@@ -75,17 +149,20 @@ def worker(output_dir):
        url = url.strip()
        parsed = urllib.parse.urlparse(url)
        path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
-        os.makedirs(path, exist_ok=True)
-        download_all(path, url)
+        download_all(path, url, langs)
    return wrapped


def parse_args():
    parser = argparse.ArgumentParser(description="Download wiki pages.")
-    parser.add_argument("o", metavar="PATH", type=str,
+    parser.add_argument("--o", metavar="PATH", type=str,
                        help="Output dir for saving pages")
    parser.add_argument('--i', metavar="PATH", type=str, required=True,
                        help="Input file with wikipedia url.")
+    parser.add_argument('--langs', metavar="LANGS", type=str, nargs='+',
+                        action='append',
+                        help="Languages for pages. If left blank, pages in all "
+                             "available languages will be loaded.")
    return parser.parse_args()
@@ -95,11 +172,13 @@ def main():
    args = parse_args()
    input_file = args.i
    output_dir = args.o
+    langs = list(itertools.chain.from_iterable(args.langs))
    os.makedirs(output_dir, exist_ok=True)
    with open(input_file) as file:
        _ = file.readline()
        pool = ThreadPool(processes=WORKERS)
-        pool.map(worker(output_dir), file, CHUNK_SIZE)
+        pool.map(worker(output_dir, langs), file, CHUNK_SIZE)
        pool.close()
        pool.join()
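
The least obvious part of the patch is the section stripping behind beautify_page: remove_bad_sections walks every tag in document order and, whenever it meets a heading whose text is black-listed for the page language, drops that heading together with everything up to the next heading, after which the page is minified with htmlmin. Below is a self-contained sketch of the same idea, not the patch itself: the sample HTML, the strip_bad_sections name and the explicit "html.parser" builder are mine (the patch asks BeautifulSoup for a generic "html" builder instead).

import htmlmin
from bs4 import BeautifulSoup

HEADERS = {f"h{x}" for x in range(1, 7)}
BAD_SECTIONS = {"en": ["External links", "See also"]}

def strip_bad_sections(html, lang):
    soup = BeautifulSoup(html, "html.parser")
    it = iter(soup.find_all())
    current = next(it, None)
    while current is not None:
        if current.name in HEADERS and current.text.strip() in BAD_SECTIONS.get(lang, []):
            # Drop the black-listed heading and every element up to the next heading.
            current.extract()
            current = next(it, None)
            while current is not None and current.name not in HEADERS:
                current.extract()
                current = next(it, None)
        else:
            current = next(it, None)
    return htmlmin.minify(str(soup), remove_empty_space=True)

sample = ("<h2>History</h2><p>kept</p>"
          "<h2>External links</h2><ul><li>dropped</li></ul>"
          "<h2>Geography</h2><p>also kept</p>")
print(strip_bad_sections(sample, "en"))
# -> <h2>History</h2><p>kept</p><h2>Geography</h2><p>also kept</p>
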

View file

@@ -575,9 +575,10 @@ if [ "$MODE" == "descriptions" ]; then
  URLS_PATH="$INTDIR/wiki_urls.txt"
  WIKI_PAGES_PATH="$INTDIR/descriptions"
  LOG="$LOG_PATH/descriptions.log"
+  LANGS="en ru es"
  "$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" --user_resource_path="$DATA_PATH/" --dump_wikipedia_urls="$URLS_PATH" 2>> $LOG
-  $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i="$URLS_PATH" "$WIKI_PAGES_PATH" 2>> $LOG
+  $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i "$URLS_PATH" --o "$WIKI_PAGES_PATH" --langs "$LANGS" 2>> $LOG
  for file in "$TARGET"/*.mwm; do
    if [[ "$file" != *minsk-pass* && "$file" != *World* ]]; then