[generator] Added optional langs and beautify_page
parent 2bc0ca8a69
commit db39821893
2 changed files with 103 additions and 23 deletions
@@ -1,10 +1,14 @@
 import os
+import re
 import argparse
 import functools
 import logging
+import itertools
 import urllib.parse
 import wikipediaapi
+import htmlmin
 from multiprocessing.pool import ThreadPool
+from bs4 import BeautifulSoup
 """
 This script downloads Wikipedia pages for different languages.
 """
@@ -13,8 +17,62 @@ log = logging.getLogger(__name__)
 WORKERS = 16
 CHUNK_SIZE = 64

+HEADERS = {f"h{x}" for x in range(1,7)}
+BAD_SECTIONS = {
+    "en": ["External links", "Sources", "See also", "Bibliography", "Further reading"],
+    "ru": ["Литература", "Ссылки", "См. также"],
+    "es": ["Vínculos de interés", "Véase también", "Enlaces externos"]
+}
+
+
+def remove_bad_sections(soup, lang):
+    if lang not in BAD_SECTIONS:
+        return soup
+
+    it = iter(soup.find_all())
+    current = next(it, None)
+    while current is not None:
+        if current.name in HEADERS and current.text.strip() in BAD_SECTIONS[lang]:
+            current.extract()
+            current = next(it, None)
+            while current is not None:
+                if current.name in HEADERS:
+                    break
+                current.extract()
+                current = next(it, None)
+        else:
+            current = next(it, None)
+    return soup
+
+
+def remove_empty_sections(soup):
+    prev = None
+    for x in soup.find_all():
+        if prev is not None and x.name in HEADERS and prev.name in HEADERS:
+            prev.extract()
+        prev = x
+    return soup
+
+
+def beautify_page(html, lang):
+    soup = BeautifulSoup(html, "html")
+    for x in soup.find_all():
+        if len(x.text.strip()) == 0:
+            x.extract()
+
+    soup = remove_empty_sections(soup)
+    soup = remove_bad_sections(soup, lang)
+    html = str(soup.prettify())
+    html = htmlmin.minify(html, remove_empty_space=True)
+    return html
+
+
+def need_lang(lang, langs):
+    return lang in langs if langs else True
+
+
-def download(dir, url):
+def download(directory, url):
     url = urllib.parse.unquote(url)
     parsed = urllib.parse.urlparse(url)
     try:
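
Aside (not part of the diff): the sketch below exercises the section-stripping logic added above on a tiny hand-written page, assuming bs4 is installed. The sample HTML and the reduced BAD_SECTIONS table are illustrative only, and remove_bad_sections is a slightly condensed restatement of the function in this commit.

# Standalone sketch, not from the commit: a condensed copy of the new
# remove_bad_sections() run on a tiny example page.
from bs4 import BeautifulSoup

HEADERS = {f"h{x}" for x in range(1, 7)}
BAD_SECTIONS = {"en": ["External links", "Sources", "See also"]}  # reduced example table

def remove_bad_sections(soup, lang):
    # Drop every blacklisted header together with the elements that follow it,
    # up to (but not including) the next header.
    it = iter(soup.find_all())
    current = next(it, None)
    while current is not None:
        if current.name in HEADERS and current.text.strip() in BAD_SECTIONS.get(lang, []):
            current.extract()
            current = next(it, None)
            while current is not None and current.name not in HEADERS:
                current.extract()
                current = next(it, None)
        else:
            current = next(it, None)
    return soup

html = "<h2>History</h2><p>Kept.</p><h2>See also</h2><ul><li>Dropped.</li></ul>"
print(remove_bad_sections(BeautifulSoup(html, "html.parser"), "en"))
# <h2>History</h2><p>Kept.</p>
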
@@ -22,7 +80,7 @@ def download(dir, url):
     except (AttributeError, IndexError):
         log.exception(f"{parsed.netloc} is incorrect.")
         return None
-    path = os.path.join(dir, f"{lang}.html")
+    path = os.path.join(directory, f"{lang}.html")
     if os.path.exists(path):
         log.warning(f"{path} already exists.")
         return None
@@ -37,34 +95,50 @@ def download(dir, url):
     text = page.text
     page_size = len(text)
     if page_size:
         references = "<h2>References</h2>"
         index = text.find(references)
         if index >= 0:
             text = text[:index] + text[index + len(references):]

+        text = beautify_page(text, lang)
         log.info(f"Save to {path} {lang} {page_name} {page_size}.")
+        os.makedirs(directory, exist_ok=True)
         with open(path, "w") as file:
             file.write(text)
     else:
         log.warning(f"Page {url} is empty. It has not been saved.")
-    return page
+    return text


-def download_all(path, url):
-    page = download(path, url)
-    if page is None:
-        return
+def get_wiki_langs(url):
+    url = urllib.parse.unquote(url)
+    parsed = urllib.parse.urlparse(url)
     try:
-        lang_links = page.langlinks
+        lang = parsed.netloc.split(".", maxsplit=1)[0]
+    except (AttributeError, IndexError):
+        log.exception(f"{parsed.netloc} is incorrect.")
+        return None
+    wiki = wikipediaapi.Wikipedia(language=lang,
+                                  extract_format=wikipediaapi.ExtractFormat.HTML)
+    try:
+        page_name = parsed.path.rsplit("/", maxsplit=1)[-1]
+    except (AttributeError, IndexError):
+        log.exception(f"{parsed.path} is incorrect.")
+        return None
+    page = wiki.page(page_name)
+    my_lang = [(lang, url), ]
+    try:
+        langlinks = page.langlinks
+        return list(zip(langlinks.keys(),
+                        [link.fullurl for link in langlinks.values()])) + my_lang
     except KeyError as e:
         log.warning(f"No languages for {url} ({e}).")
-        return
-
-    for link in lang_links.values():
-        download(path, link.fullurl)
+    return my_lang


-def worker(output_dir):
+def download_all(path, url, langs):
+    available_langs = get_wiki_langs(url)
+    available_langs = filter(lambda x: need_lang(x[0], langs), available_langs)
+    for lang in available_langs:
+        download(path, lang[1])
+
+
+def worker(output_dir, langs):
     @functools.wraps(worker)
     def wrapped(line):
         try:
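
Aside (not part of the diff): the new get_wiki_langs() derives the wiki language from the URL's host and the page name from its path with urllib.parse. A minimal sketch; the URL below is an example, not taken from the commit.

# Illustrative only: mirrors the parsing done in get_wiki_langs()/download().
import urllib.parse

url = urllib.parse.unquote("https://en.wikipedia.org/wiki/Minsk")  # example URL
parsed = urllib.parse.urlparse(url)
lang = parsed.netloc.split(".", maxsplit=1)[0]        # 'en' -> the wiki language
page_name = parsed.path.rsplit("/", maxsplit=1)[-1]   # 'Minsk' -> the page to fetch
print(lang, page_name)  # en Minsk
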
@@ -75,17 +149,20 @@ def worker(output_dir):
         url = url.strip()
         parsed = urllib.parse.urlparse(url)
         path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
         os.makedirs(path, exist_ok=True)
-        download_all(path, url)
+        download_all(path, url, langs)
     return wrapped


 def parse_args():
     parser = argparse.ArgumentParser(description="Download wiki pages.")
-    parser.add_argument("o", metavar="PATH", type=str,
+    parser.add_argument("--o", metavar="PATH", type=str,
                         help="Output dir for saving pages")
     parser.add_argument('--i', metavar="PATH", type=str, required=True,
                         help="Input file with wikipedia url.")
+    parser.add_argument('--langs', metavar="LANGS", type=str, nargs='+',
+                        action='append',
+                        help="Languages for pages. If left blank, pages in all "
+                             "available languages will be loaded.")
     return parser.parse_args()
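
Aside (not part of the diff): with nargs='+' plus action='append', argparse collects each --langs occurrence as its own list, so args.langs is a list of lists; that is why main() (next hunk) flattens it with itertools.chain.from_iterable. A rough sketch with an example command line:

# Illustrative only: how the new --langs option is collected and flattened.
import argparse
import itertools

parser = argparse.ArgumentParser()
parser.add_argument("--langs", metavar="LANGS", type=str, nargs="+", action="append")
args = parser.parse_args(["--langs", "en", "ru", "--langs", "es"])
print(args.langs)                                       # [['en', 'ru'], ['es']]
print(list(itertools.chain.from_iterable(args.langs)))  # ['en', 'ru', 'es']
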
@@ -95,11 +172,13 @@ def main():
     args = parse_args()
     input_file = args.i
     output_dir = args.o
+    langs = list(itertools.chain.from_iterable(args.langs))
     os.makedirs(output_dir, exist_ok=True)

     with open(input_file) as file:
         _ = file.readline()
         pool = ThreadPool(processes=WORKERS)
-        pool.map(worker(output_dir), file, CHUNK_SIZE)
+        pool.map(worker(output_dir, langs), file, CHUNK_SIZE)
         pool.close()
         pool.join()

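Aside (not part of the diff): need_lang(), together with the reworked download_all(), keeps only the requested languages out of the (lang, url) pairs returned by get_wiki_langs(). A minimal sketch with made-up pairs:

# Illustrative only: how need_lang() narrows the (lang, url) pairs to download.
def need_lang(lang, langs):
    return lang in langs if langs else True

available_langs = [("en", "https://en.wikipedia.org/wiki/Minsk"),   # example pairs,
                   ("ru", "https://ru.wikipedia.org/wiki/Минск"),   # not from the commit
                   ("de", "https://de.wikipedia.org/wiki/Minsk")]
print(list(filter(lambda x: need_lang(x[0], ["en", "ru"]), available_langs)))
# [('en', 'https://en.wikipedia.org/wiki/Minsk'), ('ru', 'https://ru.wikipedia.org/wiki/Минск')]
print(list(filter(lambda x: need_lang(x[0], None), available_langs)) == available_langs)  # True
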
@@ -575,9 +575,10 @@ if [ "$MODE" == "descriptions" ]; then
   URLS_PATH="$INTDIR/wiki_urls.txt"
   WIKI_PAGES_PATH="$INTDIR/descriptions"
   LOG="$LOG_PATH/descriptions.log"
+  LANGS="en ru es"

   "$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" --user_resource_path="$DATA_PATH/" --dump_wikipedia_urls="$URLS_PATH" 2>> $LOG
-  $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i="$URLS_PATH" "$WIKI_PAGES_PATH" 2>> $LOG
+  $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i "$URLS_PATH" --o "$WIKI_PAGES_PATH" --langs "$LANGS" 2>> $LOG

   for file in "$TARGET"/*.mwm; do
     if [[ "$file" != *minsk-pass* && "$file" != *World* ]]; then