diff --git a/tools/python/descriptions_downloader.py b/tools/python/descriptions_downloader.py
index 1b759e67e9..28c82dd3bb 100644
--- a/tools/python/descriptions_downloader.py
+++ b/tools/python/descriptions_downloader.py
@@ -15,17 +15,25 @@ This script downloads Wikipedia pages for different languages.
 """
 log = logging.getLogger(__name__)
 
-WORKERS = 80
+WORKERS = 10
 CHUNK_SIZE = 128
 
 HEADERS = {f"h{x}" for x in range(1,7)}
 
 BAD_SECTIONS = {
-    "en": ["External links", "Sources", "See also", "Bibliography", "Further reading"],
+    "en": ["External links", "Sources", "See also", "Bibliography", "Further reading", "References"],
     "ru": ["Литература", "Ссылки", "См. также", "Библиография", "Примечания"],
-    "es": ["Vínculos de interés", "Véase también", "Enlaces externos"]
+    "es": ["Vínculos de interés", "Véase también", "Enlaces externos", "Referencias"]
 }
 
+class ParseError(Exception):
+    def __init__(self, value):
+        self.value = value
+
+    def __str__(self):
+        return repr(self.value)
+
+
 def read_popularity(path):
     """
     :param path: a path of popularity file. A file contains ',' rows.
@@ -51,7 +59,6 @@ def should_download_wikipage(popularity_set):
 def remove_bad_sections(soup, lang):
     if lang not in BAD_SECTIONS:
         return soup
-
     it = iter(soup.find_all())
     current = next(it, None)
     current_header_level = None
@@ -75,7 +82,6 @@ def beautify_page(html, lang):
     for x in soup.find_all():
         if len(x.text.strip()) == 0:
             x.extract()
-
     soup = remove_bad_sections(soup, lang)
     html = str(soup.prettify())
     html = htmlmin.minify(html, remove_empty_space=True)
@@ -86,29 +92,44 @@ def need_lang(lang, langs):
     return lang in langs if langs else True
 
 
-def download(directory, url):
+def get_page_info(url):
     url = urllib.parse.unquote(url)
     parsed = urllib.parse.urlparse(url)
     try:
         lang = parsed.netloc.split(".", maxsplit=1)[0]
     except (AttributeError, IndexError):
-        log.exception(f"{parsed.netloc} is incorrect.")
+        raise ParseError(f"{parsed.netloc} is incorrect.")
+    try:
+        page_name = parsed.path.rsplit("/", maxsplit=1)[-1]
+    except (AttributeError, IndexError):
+        raise ParseError(f"{parsed.path} is incorrect.")
+    return lang, page_name
+
+
+def get_wiki_page(lang, page_name):
+    wiki = wikipediaapi.Wikipedia(language=lang,
+                                  extract_format=wikipediaapi.ExtractFormat.HTML)
+    return wiki.page(page_name)
+
+
+def download(directory, url):
+    try:
+        lang, page_name = get_page_info(url)
+    except ParseError:
+        log.exception(f"Parsing failed. {url} is incorrect.")
         return None
     path = os.path.join(directory, f"{lang}.html")
     if os.path.exists(path):
         log.warning(f"{path} already exists.")
         return None
+    page = get_wiki_page(lang, page_name)
     try:
-        page_name = parsed.path.rsplit("/", maxsplit=1)[-1]
-    except (AttributeError, IndexError):
-        log.exception(f"{parsed.path} is incorrect.")
+        text = page.text
+    except KeyError:
+        log.exception(f"Error: page is not downloaded {page_name}.")
         return None
-    wiki = wikipediaapi.Wikipedia(language=lang,
-                                  extract_format=wikipediaapi.ExtractFormat.HTML)
-    page = wiki.page(page_name)
-    text = page.text
     page_size = len(text)
-    if page_size:
+    if page_size > 0:
         os.makedirs(directory, exist_ok=True)
         text = beautify_page(text, lang)
         log.info(f"Save to {path} {lang} {page_name} {page_size}.")
@@ -120,33 +141,24 @@ def download(directory, url):
 
 
 def get_wiki_langs(url):
-    url = urllib.parse.unquote(url)
-    parsed = urllib.parse.urlparse(url)
-    try:
-        lang = parsed.netloc.split(".", maxsplit=1)[0]
-    except (AttributeError, IndexError):
-        log.exception(f"{parsed.netloc} is incorrect.")
-        return None
-    wiki = wikipediaapi.Wikipedia(language=lang,
-                                  extract_format=wikipediaapi.ExtractFormat.HTML)
-    try:
-        page_name = parsed.path.rsplit("/", maxsplit=1)[-1]
-    except (AttributeError, IndexError):
-        log.exception(f"{parsed.path} is incorrect.")
-        return None
-    page = wiki.page(page_name)
-    my_lang = [(lang, url), ]
+    lang, page_name = get_page_info(url)
+    page = get_wiki_page(lang, page_name)
+    curr_lang = [(lang, url), ]
     try:
         langlinks = page.langlinks
         return list(zip(langlinks.keys(),
-                        [link.fullurl for link in langlinks.values()])) + my_lang
+                        [link.fullurl for link in langlinks.values()])) + curr_lang
     except KeyError as e:
         log.warning(f"No languages for {url} ({e}).")
-        return my_lang
+        return curr_lang
 
 
 def download_all(path, url, langs):
-    available_langs = get_wiki_langs(url)
+    try:
+        available_langs = get_wiki_langs(url)
+    except ParseError:
+        log.exception(f"Parsing failed. {url} is incorrect.")
+        return
     available_langs = filter(lambda x: need_lang(x[0], langs), available_langs)
     for lang in available_langs:
         download(path, lang[1])
@@ -157,9 +169,8 @@ def worker(output_dir, checker, langs):
     def wrapped(line):
         if not line.strip():
             return
 
-        try:
-            (mwm_path, ident, url) = line.split("\t")
+        mwm_path, ident, url = line.split("\t")
         ident = int(ident)
         if not checker(ident):
             return
diff --git a/tools/unix/generate_planet.sh b/tools/unix/generate_planet.sh
index 1bbcada0f0..b828ba45c8 100755
--- a/tools/unix/generate_planet.sh
+++ b/tools/unix/generate_planet.sh
@@ -578,7 +578,7 @@ if [ "$MODE" == "descriptions" ]; then
 
   LANGS="en ru es"
   "$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" --user_resource_path="$DATA_PATH/" --dump_wikipedia_urls="$URLS_PATH" 2>> $LOG
-  $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i "$URLS_PATH" --o "$WIKI_PAGES_PATH" --langs "$LANGS" 2>> $LOG
+  $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i "$URLS_PATH" --o "$WIKI_PAGES_PATH" --langs $LANGS 2>> $LOG
 
   for file in "$TARGET"/*.mwm; do
     if [[ "$file" != *minsk-pass* && "$file" != *World* ]]; then