From 3dfe90f9381baf91d2b1689eda8fdfee4c7a8da5 Mon Sep 17 00:00:00 2001 From: Maksim Andrianov Date: Wed, 12 Dec 2018 18:41:16 +0300 Subject: [PATCH] Review fixes --- tools/python/descriptions_downloader.py | 37 ++++++++++--------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/tools/python/descriptions_downloader.py b/tools/python/descriptions_downloader.py index a3eecf6ca7..1b759e67e9 100644 --- a/tools/python/descriptions_downloader.py +++ b/tools/python/descriptions_downloader.py @@ -27,6 +27,10 @@ BAD_SECTIONS = { def read_popularity(path): + """ + :param path: a path of popularity file. A file contains ',' rows. + :return: a set of popularity object ids + """ ids = set() for line in open(path): try: @@ -37,10 +41,10 @@ def read_popularity(path): return ids -def popularity_checker(popularity_set): - @functools.wraps(worker) +def should_download_wikipage(popularity_set): + @functools.wraps(popularity_set) def wrapped(ident): - return False if popularity_set is None else ident in popularity_set + return popularity_set is None or ident in popularity_set return wrapped @@ -66,25 +70,12 @@ def remove_bad_sections(soup, lang): return soup -def remove_empty_sections(soup): - prev = None - for x in soup.find_all(): - if prev is not None and x.name in HEADERS and prev.name == x.name: - prev.extract() - prev = x - - if prev is not None and prev.name in HEADERS: - prev.extract() - return soup - - def beautify_page(html, lang): soup = BeautifulSoup(html, "html") for x in soup.find_all(): if len(x.text.strip()) == 0: x.extract() - soup = remove_empty_sections(soup) soup = remove_bad_sections(soup, lang) html = str(soup.prettify()) html = htmlmin.minify(html, remove_empty_space=True) @@ -168,11 +159,11 @@ def worker(output_dir, checker, langs): return try: - splitted = line.rsplit("\t") - ident = int(splitted[1].strip()) - if checker(ident): + (mwm_path, ident, url) = line.split("\t") + ident = int(ident) + if not checker(ident): return - url = splitted[-1].strip() + url = url.strip() except (AttributeError, IndexError): log.exception(f"{line} is incorrect.") return @@ -187,7 +178,9 @@ def parse_args(): parser.add_argument("--o", metavar="PATH", type=str, help="Output dir for saving pages") parser.add_argument("--p", metavar="PATH", type=str, - help="Input popularity file.") + help="File with popular object ids for which we " + "download wikipedia data. If not given, download " + "for all objects.") parser.add_argument('--i', metavar="PATH", type=str, required=True, help="Input file with wikipedia url.") parser.add_argument('--langs', metavar="LANGS", type=str, nargs='+', @@ -209,7 +202,7 @@ def main(): popularity_set = read_popularity(popularity_file) if popularity_file else None if popularity_set: log.info(f"Popularity set size: {len(popularity_set)}.") - checker = popularity_checker(popularity_set) + checker = should_download_wikipage(popularity_set) with open(input_file) as file: _ = file.readline() pool = ThreadPool(processes=WORKERS)